LLVM 18.1.0rc
SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84 SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI),
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, not all operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
227 // TODO: Could make these legal
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236
238 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
239 }
240
241 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
242 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
243 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
244 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
245 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
246 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
247 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
248 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
249 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
250 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
251 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
252 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
253 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
254 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
255 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
256 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
257
258 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
259 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
260 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
264 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
265
266 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
267
271 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
272
273 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
274
276 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
277
279 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
280 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
281
283 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
284 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
285 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
286 Expand);
288 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
289 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
290 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
291 Expand);
292
294 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
295 MVT::v3i16, MVT::v4i16, MVT::Other},
296 Custom);
297
300 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
301
303
305
307 Expand);
308
309#if 0
311#endif
312
313 // We only support LOAD/STORE and vector manipulation ops for vectors
314 // with > 4 elements.
315 for (MVT VT :
316 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
317 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
318 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
319 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
320 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
321 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
322 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
323 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
324 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
325 switch (Op) {
326 case ISD::LOAD:
327 case ISD::STORE:
329 case ISD::BITCAST:
330 case ISD::UNDEF:
334 case ISD::IS_FPCLASS:
335 break;
340 break;
341 default:
343 break;
344 }
345 }
346 }
347
349
350 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
351 // is expanded to avoid having two separate loops in case the index is a VGPR.
352
353 // Most operations are naturally 32-bit vector operations. We only support
354 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
355 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
357 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
367 }
368
369 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
371 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
381 }
382
383 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
385 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
395 }
396
397 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
399 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
409 }
410
411 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
413 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
423 }
424
426 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
427 Expand);
428
429 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
430 Custom);
431
432 // Avoid stack access for these.
433 // TODO: Generalize to more vector types.
435 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
436 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
437 Custom);
438
439 // Deal with vec3 vector operations when widened to vec4.
441 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
442
443 // Deal with vec5/6/7 vector operations when widened to vec8.
445 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
446 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
447 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
448 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
449 Custom);
450
451 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
452 // and output demarshalling
453 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
454
455 // We can't return success/failure, only the old value,
456 // let LLVM add the comparison
458 Expand);
459
460 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
461
462 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
463
464 // FIXME: This should be narrowed to i32, but that only happens if i64 is
465 // illegal.
466 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
467 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
468
469 // This is s_memtime on SI and s_memrealtime on VI.
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
633 case ISD::IS_FPCLASS:
634 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 if (!Subtarget->hasVOP3PInsts())
745 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
754
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax())
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
855
857 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
858 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
859 Custom);
860
862 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
863 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
864 MVT::i16, MVT::i8, MVT::i128},
865 Custom);
866
868 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
869 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
870 MVT::i8, MVT::i128},
871 Custom);
872
875
876 // TODO: Could move this to custom lowering, could benefit from combines on
877 // extract of relevant bits.
879
881
884 ISD::SUB,
886 ISD::FADD,
887 ISD::FSUB,
888 ISD::FDIV,
895 ISD::FMA,
896 ISD::SMIN,
897 ISD::SMAX,
898 ISD::UMIN,
899 ISD::UMAX,
901 ISD::AND,
902 ISD::OR,
903 ISD::XOR,
904 ISD::FSHR,
914
915 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
917
918 // All memory operations. Some folding on the pointer operand is done to help
919 // matching the constant offsets in the addressing modes.
942
943 // FIXME: In other contexts we pretend this is a per-function property.
945
947}
948
950 return Subtarget;
951}
952
953//===----------------------------------------------------------------------===//
954// TargetLowering queries
955//===----------------------------------------------------------------------===//
956
957// v_mad_mix* support a conversion from f16 to f32.
958//
959 // There is only one special case when denormals are enabled, which we don't
960 // currently handle, where this is OK to use.
961bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
962 EVT DestVT, EVT SrcVT) const {
963 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
964 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
965 DestVT.getScalarType() == MVT::f32 &&
966 SrcVT.getScalarType() == MVT::f16 &&
967 // TODO: This probably only requires no input flushing?
969}
970
972 LLT DestTy, LLT SrcTy) const {
973 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
974 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
975 DestTy.getScalarSizeInBits() == 32 &&
976 SrcTy.getScalarSizeInBits() == 16 &&
977 // TODO: This probably only requires no input flushing?
979}
980
982 // SI has some legal vector types, but no legal vector operations. Say no
983 // shuffles are legal in order to prefer scalarizing some vector operations.
984 return false;
985}
986
989 EVT VT) const {
992
993 if (VT.isVector()) {
994 EVT ScalarVT = VT.getScalarType();
995 unsigned Size = ScalarVT.getSizeInBits();
996 if (Size == 16) {
997 if (Subtarget->has16BitInsts()) {
998 if (VT.isInteger())
999 return MVT::v2i16;
1000 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1001 }
1002 return VT.isInteger() ? MVT::i32 : MVT::f32;
1003 }
1004
1005 if (Size < 16)
1006 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1007 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1008 }
1009
1010 if (VT.getSizeInBits() > 32)
1011 return MVT::i32;
1012
1014}
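// Illustrative examples for the register type chosen above (assuming a
// subtarget with 16-bit instructions and a non-kernel calling convention): a
// v4f16 argument is passed in MVT::v2f16 registers, a v3i32 argument in
// MVT::i32 registers, and a scalar i64 value likewise ends up in MVT::i32
// registers.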
1015
1018 EVT VT) const {
1021
1022 if (VT.isVector()) {
1023 unsigned NumElts = VT.getVectorNumElements();
1024 EVT ScalarVT = VT.getScalarType();
1025 unsigned Size = ScalarVT.getSizeInBits();
1026
1027 // FIXME: Should probably promote 8-bit vectors to i16.
1028 if (Size == 16 && Subtarget->has16BitInsts())
1029 return (NumElts + 1) / 2;
1030
1031 if (Size <= 32)
1032 return NumElts;
1033
1034 if (Size > 32)
1035 return NumElts * ((Size + 31) / 32);
1036 } else if (VT.getSizeInBits() > 32)
1037 return (VT.getSizeInBits() + 31) / 32;
1038
1040}
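// Illustrative register counts for the computation above (again assuming
// 16-bit instructions): v4f16 needs (4 + 1) / 2 == 2 registers, v3i32 needs 3,
// and a scalar i64 needs (64 + 31) / 32 == 2 registers.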
1041
1043 LLVMContext &Context, CallingConv::ID CC,
1044 EVT VT, EVT &IntermediateVT,
1045 unsigned &NumIntermediates, MVT &RegisterVT) const {
1046 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1047 unsigned NumElts = VT.getVectorNumElements();
1048 EVT ScalarVT = VT.getScalarType();
1049 unsigned Size = ScalarVT.getSizeInBits();
1050 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1051 // support, but unless we can properly handle 3-vectors, it will still be
1052 // inconsistent.
1053 if (Size == 16 && Subtarget->has16BitInsts()) {
1054 if (ScalarVT == MVT::bf16) {
1055 RegisterVT = MVT::i32;
1056 IntermediateVT = MVT::v2bf16;
1057 } else {
1058 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1059 IntermediateVT = RegisterVT;
1060 }
1061 NumIntermediates = (NumElts + 1) / 2;
1062 return NumIntermediates;
1063 }
1064
1065 if (Size == 32) {
1066 RegisterVT = ScalarVT.getSimpleVT();
1067 IntermediateVT = RegisterVT;
1068 NumIntermediates = NumElts;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size < 16 && Subtarget->has16BitInsts()) {
1073 // FIXME: Should probably form v2i16 pieces
1074 RegisterVT = MVT::i16;
1075 IntermediateVT = ScalarVT;
1076 NumIntermediates = NumElts;
1077 return NumIntermediates;
1078 }
1079
1080
1081 if (Size != 16 && Size <= 32) {
1082 RegisterVT = MVT::i32;
1083 IntermediateVT = ScalarVT;
1084 NumIntermediates = NumElts;
1085 return NumIntermediates;
1086 }
1087
1088 if (Size > 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = RegisterVT;
1091 NumIntermediates = NumElts * ((Size + 31) / 32);
1092 return NumIntermediates;
1093 }
1094 }
1095
1097 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1098}
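// Illustrative breakdown for the routine above (assuming 16-bit instructions
// and a non-kernel calling convention): a v5f16 value is split into
// NumIntermediates == (5 + 1) / 2 == 3 pieces with IntermediateVT ==
// RegisterVT == MVT::v2f16, while a v3i64 value is split into 3 * 2 == 6
// pieces of MVT::i32.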
1099
1100static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1101 assert(MaxNumLanes != 0);
1102
1103 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1104 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1105 return EVT::getVectorVT(Ty->getContext(),
1106 EVT::getEVT(VT->getElementType()),
1107 NumElts);
1108 }
1109
1110 return EVT::getEVT(Ty);
1111}
1112
1113// Peek through TFE struct returns to only use the data size.
1114static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1115 auto *ST = dyn_cast<StructType>(Ty);
1116 if (!ST)
1117 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1118
1119 // TFE intrinsics return an aggregate type.
1120 assert(ST->getNumContainedTypes() == 2 &&
1121 ST->getContainedType(1)->isIntegerTy(32));
1122 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1123}
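// Illustrative example: a TFE load whose IR return type is { <4 x float>, i32 }
// with MaxNumLanes == 4 reports a memVT of v4f32; only the data member counts,
// and the trailing i32 status word is ignored. If the dmask limits MaxNumLanes
// to 2, the memVT becomes v2f32.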
1124
1125/// Map address space 7 to MVT::v5i32 because that's its in-memory
1126/// representation. This return value is vector-typed because there is no
1127/// MVT::i160 and it is not clear if one can be added. While this could
1128/// cause issues during codegen, these address space 7 pointers will be
1129/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1130/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1131/// modeling, to work.
1133 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1134 return MVT::v5i32;
1136 DL.getPointerSizeInBits(AS) == 192)
1137 return MVT::v6i32;
1139}
1140/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1141/// v8i32 when padding is added.
1142/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1143/// also v8i32 with padding.
1145 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1146 DL.getPointerSizeInBits(AS) == 160) ||
1148 DL.getPointerSizeInBits(AS) == 192))
1149 return MVT::v8i32;
1151}
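// Illustrative consequence of the two mappings above (the getPointerTy and
// getPointerMemTy overrides): a 160-bit buffer fat pointer (address space 7)
// is reported as MVT::v5i32 as a value type, but as MVT::v8i32 for its padded
// { p8, i32 } in-memory form.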
1152
1154 const CallInst &CI,
1155 MachineFunction &MF,
1156 unsigned IntrID) const {
1158 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1160
1161 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1164 (Intrinsic::ID)IntrID);
1165 MemoryEffects ME = Attr.getMemoryEffects();
1166 if (ME.doesNotAccessMemory())
1167 return false;
1168
1169 // TODO: Should images get their own address space?
1170 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1171
1172 if (RsrcIntr->IsImage)
1173 Info.align.reset();
1174
1175 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1176 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1177 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1178 // We conservatively set the memory operand of a buffer intrinsic to the
1179 // base resource pointer, so that we can access alias information about
1180 // those pointers. Cases like "this points at the same value
1181 // but with a different offset" are handled in
1182 // areMemAccessesTriviallyDisjoint.
1183 Info.ptrVal = RsrcArg;
1184 }
1185
1186 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1187 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1190 if (ME.onlyReadsMemory()) {
1191 unsigned MaxNumLanes = 4;
1192
1193 if (RsrcIntr->IsImage) {
1196 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1198
1199 if (!BaseOpcode->Gather4) {
1200 // If this isn't a gather, we may have excess loaded elements in the
1201 // IR type. Check the dmask for the real number of elements loaded.
1202 unsigned DMask
1203 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1204 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1205 }
1206 }
1207
1208 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1209
1210 // FIXME: What does alignment mean for an image?
1213 } else if (ME.onlyWritesMemory()) {
1215
1216 Type *DataTy = CI.getArgOperand(0)->getType();
1217 if (RsrcIntr->IsImage) {
1218 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1219 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1220 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1221 } else
1222 Info.memVT = EVT::getEVT(DataTy);
1223
1225 } else {
1226 // Atomic
1227 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1229 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1233
1234 switch (IntrID) {
1235 default:
1236 // XXX - Should this be volatile without known ordering?
1238 break;
1239 case Intrinsic::amdgcn_raw_buffer_load_lds:
1240 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1241 case Intrinsic::amdgcn_struct_buffer_load_lds:
1242 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1243 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1244 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1245 Info.ptrVal = CI.getArgOperand(1);
1246 return true;
1247 }
1248 }
1249 }
1250 return true;
1251 }
1252
1253 switch (IntrID) {
1254 case Intrinsic::amdgcn_ds_ordered_add:
1255 case Intrinsic::amdgcn_ds_ordered_swap:
1256 case Intrinsic::amdgcn_ds_fadd:
1257 case Intrinsic::amdgcn_ds_fmin:
1258 case Intrinsic::amdgcn_ds_fmax: {
1260 Info.memVT = MVT::getVT(CI.getType());
1261 Info.ptrVal = CI.getOperand(0);
1262 Info.align.reset();
1264
1265 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1266 if (!Vol->isZero())
1268
1269 return true;
1270 }
1271 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1273 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1274 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1275 Info.align.reset();
1277
1278 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1279 if (!Vol || !Vol->isZero())
1281
1282 return true;
1283 }
1284 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1285 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1287 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1288 Info.ptrVal = nullptr;
1289 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1291 return true;
1292 }
1293 case Intrinsic::amdgcn_ds_append:
1294 case Intrinsic::amdgcn_ds_consume: {
1296 Info.memVT = MVT::getVT(CI.getType());
1297 Info.ptrVal = CI.getOperand(0);
1298 Info.align.reset();
1300
1301 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1302 if (!Vol->isZero())
1304
1305 return true;
1306 }
1307 case Intrinsic::amdgcn_global_atomic_csub: {
1309 Info.memVT = MVT::getVT(CI.getType());
1310 Info.ptrVal = CI.getOperand(0);
1311 Info.align.reset();
1315 return true;
1316 }
1317 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1319 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1320
1321 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1322 Info.align.reset();
1325 return true;
1326 }
1327 case Intrinsic::amdgcn_global_atomic_fadd:
1328 case Intrinsic::amdgcn_global_atomic_fmin:
1329 case Intrinsic::amdgcn_global_atomic_fmax:
1330 case Intrinsic::amdgcn_global_atomic_fmin_num:
1331 case Intrinsic::amdgcn_global_atomic_fmax_num:
1332 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1333 case Intrinsic::amdgcn_flat_atomic_fadd:
1334 case Intrinsic::amdgcn_flat_atomic_fmin:
1335 case Intrinsic::amdgcn_flat_atomic_fmax:
1336 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1337 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1338 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1339 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1340 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1342 Info.memVT = MVT::getVT(CI.getType());
1343 Info.ptrVal = CI.getOperand(0);
1344 Info.align.reset();
1349 return true;
1350 }
1351 case Intrinsic::amdgcn_global_load_tr: {
1353 Info.memVT = MVT::getVT(CI.getType());
1354 Info.ptrVal = CI.getOperand(0);
1355 Info.align.reset();
1357 return true;
1358 }
1359 case Intrinsic::amdgcn_ds_gws_init:
1360 case Intrinsic::amdgcn_ds_gws_barrier:
1361 case Intrinsic::amdgcn_ds_gws_sema_v:
1362 case Intrinsic::amdgcn_ds_gws_sema_br:
1363 case Intrinsic::amdgcn_ds_gws_sema_p:
1364 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1366
1367 const GCNTargetMachine &TM =
1368 static_cast<const GCNTargetMachine &>(getTargetMachine());
1369
1371 Info.ptrVal = MFI->getGWSPSV(TM);
1372
1373 // This is an abstract access, but we need to specify a type and size.
1374 Info.memVT = MVT::i32;
1375 Info.size = 4;
1376 Info.align = Align(4);
1377
1378 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1380 else
1382 return true;
1383 }
1384 case Intrinsic::amdgcn_global_load_lds: {
1386 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1387 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1388 Info.ptrVal = CI.getArgOperand(1);
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1394
1395 const GCNTargetMachine &TM =
1396 static_cast<const GCNTargetMachine &>(getTargetMachine());
1397
1399 Info.ptrVal = MFI->getGWSPSV(TM);
1400
1401 // This is an abstract access, but we need to specify a type and size.
1402 Info.memVT = MVT::i32;
1403 Info.size = 4;
1404 Info.align = Align(4);
1405
1407 return true;
1408 }
1409 default:
1410 return false;
1411 }
1412}
1413
1416 Type *&AccessTy) const {
1417 switch (II->getIntrinsicID()) {
1418 case Intrinsic::amdgcn_global_load_tr:
1419 case Intrinsic::amdgcn_ds_ordered_add:
1420 case Intrinsic::amdgcn_ds_ordered_swap:
1421 case Intrinsic::amdgcn_ds_append:
1422 case Intrinsic::amdgcn_ds_consume:
1423 case Intrinsic::amdgcn_ds_fadd:
1424 case Intrinsic::amdgcn_ds_fmin:
1425 case Intrinsic::amdgcn_ds_fmax:
1426 case Intrinsic::amdgcn_global_atomic_fadd:
1427 case Intrinsic::amdgcn_flat_atomic_fadd:
1428 case Intrinsic::amdgcn_flat_atomic_fmin:
1429 case Intrinsic::amdgcn_flat_atomic_fmax:
1430 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1431 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1432 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1433 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1434 case Intrinsic::amdgcn_global_atomic_csub: {
1435 Value *Ptr = II->getArgOperand(0);
1436 AccessTy = II->getType();
1437 Ops.push_back(Ptr);
1438 return true;
1439 }
1440 default:
1441 return false;
1442 }
1443}
1444
1445bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1446 unsigned AddrSpace,
1447 uint64_t FlatVariant) const {
1448 if (!Subtarget->hasFlatInstOffsets()) {
1449 // Flat instructions do not have offsets, and only have the register
1450 // address.
1451 return AM.BaseOffs == 0 && AM.Scale == 0;
1452 }
1453
1454 return AM.Scale == 0 &&
1455 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1456 AM.BaseOffs, AddrSpace, FlatVariant));
1457}
1458
1460 if (Subtarget->hasFlatGlobalInsts())
1461 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1463
1464 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1465 // Assume that we will use FLAT for all global memory accesses
1466 // on VI.
1467 // FIXME: This assumption is currently wrong. On VI we still use
1468 // MUBUF instructions for the r + i addressing mode. As currently
1469 // implemented, the MUBUF instructions only work on buffer < 4GB.
1470 // It may be possible to support > 4GB buffers with MUBUF instructions,
1471 // by setting the stride value in the resource descriptor which would
1472 // increase the size limit to (stride * 4GB). However, this is risky,
1473 // because it has never been validated.
1474 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1476 }
1477
1478 return isLegalMUBUFAddressingMode(AM);
1479}
1480
1481bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1482 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1483 // additionally can do r + r + i with addr64. 32-bit has more addressing
1484 // mode options. Depending on the resource constant, it can also do
1485 // (i64 r0) + (i32 r1) * (i14 i).
1486 //
1487 // Private arrays end up using a scratch buffer most of the time, so also
1488 // assume those use MUBUF instructions. Scratch loads / stores are currently
1489 // implemented as mubuf instructions with offen bit set, so slightly
1490 // different than the normal addr64.
1491 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1492 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1493 return false;
1494
1495 // FIXME: Since we can split immediate into soffset and immediate offset,
1496 // would it make sense to allow any immediate?
1497
1498 switch (AM.Scale) {
1499 case 0: // r + i or just i, depending on HasBaseReg.
1500 return true;
1501 case 1:
1502 return true; // We have r + r or r + i.
1503 case 2:
1504 if (AM.HasBaseReg) {
1505 // Reject 2 * r + r.
1506 return false;
1507 }
1508
1509 // Allow 2 * r as r + r
1510 // Or 2 * r + i is allowed as r + r + i.
1511 return true;
1512 default: // Don't allow n * r
1513 return false;
1514 }
1515}
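// Illustrative addressing modes accepted or rejected by the checks above
// (given a legal immediate offset): "base + offset" (Scale == 0) and
// "base + index + offset" (Scale == 1) are accepted; "2*index + offset" with
// no base register is accepted since it can be rewritten as
// index + index + offset, but "2*index + base" is rejected.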
1516
1518 const AddrMode &AM, Type *Ty,
1519 unsigned AS, Instruction *I) const {
1520 // No global is ever allowed as a base.
1521 if (AM.BaseGV)
1522 return false;
1523
1524 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1525 return isLegalGlobalAddressingMode(AM);
1526
1527 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1531 // If the offset isn't a multiple of 4, it probably isn't going to be
1532 // correctly aligned.
1533 // FIXME: Can we get the real alignment here?
1534 if (AM.BaseOffs % 4 != 0)
1535 return isLegalMUBUFAddressingMode(AM);
1536
1537 // There are no SMRD extloads, so if we have to do a small type access we
1538 // will use a MUBUF load.
1539 // FIXME?: We also need to do this if unaligned, but we don't know the
1540 // alignment here.
1541 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
1542 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1543 return isLegalGlobalAddressingMode(AM);
1544
1546 // SMRD instructions have an 8-bit, dword offset on SI.
1547 if (!isUInt<8>(AM.BaseOffs / 4))
1548 return false;
1549 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1550 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1551 // in 8-bits, it can use a smaller encoding.
1552 if (!isUInt<32>(AM.BaseOffs / 4))
1553 return false;
1554 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1555 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1556 if (!isUInt<20>(AM.BaseOffs))
1557 return false;
1558 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1559 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1560 // for S_BUFFER_* instructions).
1561 if (!isInt<21>(AM.BaseOffs))
1562 return false;
1563 } else {
1564 // On GFX12, all offsets are signed 24-bit in bytes.
1565 if (!isInt<24>(AM.BaseOffs))
1566 return false;
1567 }
1568
1569 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1570 return true;
1571
1572 if (AM.Scale == 1 && AM.HasBaseReg)
1573 return true;
1574
1575 return false;
1576 }
1577
1578 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1579 return Subtarget->enableFlatScratch()
1580 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1582 : isLegalMUBUFAddressingMode(AM);
1583
1584 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1585 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1586 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1587 // field.
1588 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1589 // an 8-bit dword offset but we don't know the alignment here.
1590 if (!isUInt<16>(AM.BaseOffs))
1591 return false;
1592
1593 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1594 return true;
1595
1596 if (AM.Scale == 1 && AM.HasBaseReg)
1597 return true;
1598
1599 return false;
1600 }
1601
1603 // For an unknown address space, this usually means that this is for some
1604 // reason being used for pure arithmetic, and not based on some addressing
1605 // computation. We don't have instructions that compute pointers with any
1606 // addressing modes, so treat them as having no offset like flat
1607 // instructions.
1608 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1610 }
1611
1612 // Assume a user alias of global for unknown address spaces.
1613 return isLegalGlobalAddressingMode(AM);
1614}
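// Rough summary of the SMRD offset ranges checked above: SI encodes an
// unsigned 8-bit dword offset, CI additionally allows a 32-bit literal dword
// offset, VI uses an unsigned 20-bit byte offset, GFX9 through GFX11 a signed
// 21-bit byte offset, and GFX12 a signed 24-bit byte offset.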
1615
1617 const MachineFunction &MF) const {
1619 return (MemVT.getSizeInBits() <= 4 * 32);
1620 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1621 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1622 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1623 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1624 return (MemVT.getSizeInBits() <= 2 * 32);
1625 }
1626 return true;
1627}
1628
1630 unsigned Size, unsigned AddrSpace, Align Alignment,
1631 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1632 if (IsFast)
1633 *IsFast = 0;
1634
1635 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1636 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1637 // Check if alignment requirements for ds_read/write instructions are
1638 // disabled.
1639 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1640 return false;
1641
1642 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1643 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1644 Alignment < RequiredAlignment)
1645 return false;
1646
1647 // Either the alignment requirements are "enabled", or there is an
1648 // unaligned LDS access related hardware bug even though alignment requirements
1649 // are "disabled". In either case, we need to check for proper alignment
1650 // requirements.
1651 //
1652 switch (Size) {
1653 case 64:
1654 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1655 // address is negative, then the instruction is incorrectly treated as
1656 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1657 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1658 // load later in the SILoadStoreOptimizer.
1659 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1660 return false;
1661
1662 // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1663 // can do a 4 byte aligned, 8 byte access in a single operation using
1664 // ds_read2/write2_b32 with adjacent offsets.
1665 RequiredAlignment = Align(4);
1666
1667 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1668 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1669 // ds_write2_b32 depending on the alignment. In either case with either
1670 // alignment there is no faster way of doing this.
1671
1672 // The numbers returned here and below are not additive; they form a 'speed
1673 // rank'. They are just meant to be compared to decide if a certain way
1674 // of lowering an operation is faster than another. For that purpose a
1675 // naturally aligned operation gets its bitsize to indicate that "it
1676 // operates with a speed comparable to an N-bit wide load". With full
1677 // alignment ds128 is slower than ds96, for example. If underaligned, it
1678 // is comparable to the speed of a single dword access, which would then
1679 // mean 32 < 128 and it is faster to issue a wide load regardless.
1680 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1681 // wider load which would no longer be aligned, the latter is slower.
1682 if (IsFast)
1683 *IsFast = (Alignment >= RequiredAlignment) ? 64
1684 : (Alignment < Align(4)) ? 32
1685 : 1;
1686 return true;
1687 }
1688
1689 break;
1690 case 96:
1691 if (!Subtarget->hasDS96AndDS128())
1692 return false;
1693
1694 // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1695 // gfx8 and older.
1696
1697 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1698 // Naturally aligned access is fastest. However, also report it is Fast
1699 // if memory is aligned less than DWORD. A narrow load or store will be
1700 // just as slow as a single ds_read_b96/ds_write_b96, but there will
1701 // be more of them, so overall we will pay less penalty issuing a single
1702 // instruction.
1703
1704 // See comment on the values above.
1705 if (IsFast)
1706 *IsFast = (Alignment >= RequiredAlignment) ? 96
1707 : (Alignment < Align(4)) ? 32
1708 : 1;
1709 return true;
1710 }
1711
1712 break;
1713 case 128:
1714 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1715 return false;
1716
1717 // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1718 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1719 // single operation using ds_read2/write2_b64.
1720 RequiredAlignment = Align(8);
1721
1722 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1723 // Naturally aligned access is fastest. However, also report it is Fast
1724 // if memory is aligned less than DWORD. A narrow load or store will be
1725 // just as slow as a single ds_read_b128/ds_write_b128, but there
1726 // will be more of them, so overall we will pay less penalty issuing a
1727 // single instruction.
1728
1729 // See comment on the values above.
1730 if (IsFast)
1731 *IsFast = (Alignment >= RequiredAlignment) ? 128
1732 : (Alignment < Align(4)) ? 32
1733 : 1;
1734 return true;
1735 }
1736
1737 break;
1738 default:
1739 if (Size > 32)
1740 return false;
1741
1742 break;
1743 }
1744
1745 // See comment on the values above.
1746 // Note that we have a single-dword or sub-dword here, so if underaligned
1747 // it is the slowest possible access, hence the returned value is 0.
1748 if (IsFast)
1749 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1750
1751 return Alignment >= RequiredAlignment ||
1752 Subtarget->hasUnalignedDSAccessEnabled();
1753 }
1754
1755 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1756 bool AlignedBy4 = Alignment >= Align(4);
1757 if (IsFast)
1758 *IsFast = AlignedBy4;
1759
1760 return AlignedBy4 ||
1761 Subtarget->enableFlatScratch() ||
1762 Subtarget->hasUnalignedScratchAccess();
1763 }
1764
1765 // FIXME: We have to be conservative here and assume that flat operations
1766 // will access scratch. If we had access to the IR function, then we
1767 // could determine if any private memory was used in the function.
1768 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1769 !Subtarget->hasUnalignedScratchAccess()) {
1770 bool AlignedBy4 = Alignment >= Align(4);
1771 if (IsFast)
1772 *IsFast = AlignedBy4;
1773
1774 return AlignedBy4;
1775 }
1776
1777 // So long as they are correct, wide global memory operations perform better
1778 // than multiple smaller memory ops -- even when misaligned
1779 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1780 if (IsFast)
1781 *IsFast = Size;
1782
1783 return Alignment >= Align(4) ||
1785 }
1786
1787 // Smaller than dword value must be aligned.
1788 if (Size < 32)
1789 return false;
1790
1791 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1792 // byte-address are ignored, thus forcing Dword alignment.
1793 // This applies to private, global, and constant memory.
1794 if (IsFast)
1795 *IsFast = 1;
1796
1797 return Size >= 32 && Alignment >= Align(4);
1798}
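// Illustrative 'speed rank' values from the LDS path above, assuming a
// subtarget with unaligned DS access enabled and no LDS misaligned bug: a
// 96-bit LDS access aligned to 16 bytes reports 96, the same access aligned to
// 8 bytes reports 1 ("slow, don't do it"), and one aligned to 2 bytes reports
// 32, i.e. the wide access is still preferred over splitting it into dword
// accesses.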
1799
1801 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1802 unsigned *IsFast) const {
1804 Alignment, Flags, IsFast);
1805}
1806
1808 const MemOp &Op, const AttributeList &FuncAttributes) const {
1809 // FIXME: Should account for address space here.
1810
1811 // The default fallback uses the private pointer size as a guess for a type to
1812 // use. Make sure we switch these to 64-bit accesses.
1813
1814 if (Op.size() >= 16 &&
1815 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1816 return MVT::v4i32;
1817
1818 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1819 return MVT::v2i32;
1820
1821 // Use the default.
1822 return MVT::Other;
1823}
1824
1826 const MemSDNode *MemNode = cast<MemSDNode>(N);
1827 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1828}
1829
1831 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1833}
1834
1836 unsigned DestAS) const {
1837 // Flat -> private/local is a simple truncate.
1838 // Flat -> global is a no-op.
1839 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1840 return true;
1841
1842 const GCNTargetMachine &TM =
1843 static_cast<const GCNTargetMachine &>(getTargetMachine());
1844 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1845}
1846
1848 const MemSDNode *MemNode = cast<MemSDNode>(N);
1849
1851}
1852
1855 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1856 VT.getScalarType().bitsLE(MVT::i16))
1859}
1860
1862 Type *Ty) const {
1863 // FIXME: Could be smarter if called for vector constants.
1864 return true;
1865}
1866
1868 unsigned Index) const {
1870 return false;
1871
1872 // TODO: Add more cases that are cheap.
1873 return Index == 0;
1874}
1875
1877 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1878 switch (Op) {
1879 case ISD::LOAD:
1880 case ISD::STORE:
1881
1882 // These operations are done with 32-bit instructions anyway.
1883 case ISD::AND:
1884 case ISD::OR:
1885 case ISD::XOR:
1886 case ISD::SELECT:
1887 // TODO: Extensions?
1888 return true;
1889 default:
1890 return false;
1891 }
1892 }
1893
1894 // SimplifySetCC uses this function to determine whether or not it should
1895 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1896 if (VT == MVT::i1 && Op == ISD::SETCC)
1897 return false;
1898
1900}
1901
1902SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1903 const SDLoc &SL,
1904 SDValue Chain,
1905 uint64_t Offset) const {
1906 const DataLayout &DL = DAG.getDataLayout();
1909
1910 const ArgDescriptor *InputPtrReg;
1911 const TargetRegisterClass *RC;
1912 LLT ArgTy;
1914
1915 std::tie(InputPtrReg, RC, ArgTy) =
1917
1918 // We may not have the kernarg segment argument if we have no kernel
1919 // arguments.
1920 if (!InputPtrReg)
1921 return DAG.getConstant(Offset, SL, PtrVT);
1922
1924 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1925 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1926
1927 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1928}
1929
1930SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1931 const SDLoc &SL) const {
1934 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1935}
1936
1937SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1938 const SDLoc &SL) const {
1939
1941 std::optional<uint32_t> KnownSize =
1943 if (KnownSize.has_value())
1944 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1945 return SDValue();
1946}
1947
1948SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1949 const SDLoc &SL, SDValue Val,
1950 bool Signed,
1951 const ISD::InputArg *Arg) const {
1952 // First, if it is a widened vector, narrow it.
1953 if (VT.isVector() &&
1955 EVT NarrowedVT =
1958 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1959 DAG.getConstant(0, SL, MVT::i32));
1960 }
1961
1962 // Then convert the vector elements or scalar value.
1963 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1964 VT.bitsLT(MemVT)) {
1965 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1966 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1967 }
1968
1969 if (MemVT.isFloatingPoint())
1970 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1971 else if (Signed)
1972 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1973 else
1974 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1975
1976 return Val;
1977}
1978
1979SDValue SITargetLowering::lowerKernargMemParameter(
1980 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1981 uint64_t Offset, Align Alignment, bool Signed,
1982 const ISD::InputArg *Arg) const {
1984
1985 // Try to avoid using an extload by loading earlier than the argument address,
1986 // and extracting the relevant bits. The load should hopefully be merged with
1987 // the previous argument.
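 // Worked example: an i16 argument at Offset == 6 with Alignment < 4 gives
 // AlignDownOffset == 4 and OffsetDiff == 2, i.e. an aligned i32 load at
 // offset 4 followed by a right shift of OffsetDiff * 8 == 16 bits and a
 // truncate back to 16 bits.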
1988 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1989 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1990 int64_t AlignDownOffset = alignDown(Offset, 4);
1991 int64_t OffsetDiff = Offset - AlignDownOffset;
1992
1993 EVT IntVT = MemVT.changeTypeToInteger();
1994
1995 // TODO: If we passed in the base kernel offset we could have a better
1996 // alignment than 4, but we don't really need it.
1997 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1998 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2001
2002 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2003 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2004
2005 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2006 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2007 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2008
2009
2010 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2011 }
2012
2013 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2014 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2017
2018 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2019 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2020}
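// Worked example (illustrative, not taken from the source above): a 2-byte
// kernel argument at byte offset 6 takes the sub-dword path, since both its
// store size and its alignment are below 4:
//   AlignDownOffset = alignDown(6, 4) = 4
//   OffsetDiff      = 6 - 4           = 2
// The i32 at offset 4 is loaded, shifted right by OffsetDiff * 8 = 16 bits,
// truncated to i16, then bitcast/converted to the argument type. The dword
// load can then be merged with the load for the argument at offset 4.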
2021
2022SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2023 const SDLoc &SL, SDValue Chain,
2024 const ISD::InputArg &Arg) const {
2026 MachineFrameInfo &MFI = MF.getFrameInfo();
2027
2028 if (Arg.Flags.isByVal()) {
2029 unsigned Size = Arg.Flags.getByValSize();
2030 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2031 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2032 }
2033
2034 unsigned ArgOffset = VA.getLocMemOffset();
2035 unsigned ArgSize = VA.getValVT().getStoreSize();
2036
2037 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2038
2039 // Create load nodes to retrieve arguments from the stack.
2040 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2041 SDValue ArgValue;
2042
2043 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2045 MVT MemVT = VA.getValVT();
2046
2047 switch (VA.getLocInfo()) {
2048 default:
2049 break;
2050 case CCValAssign::BCvt:
2051 MemVT = VA.getLocVT();
2052 break;
2053 case CCValAssign::SExt:
2054 ExtType = ISD::SEXTLOAD;
2055 break;
2056 case CCValAssign::ZExt:
2057 ExtType = ISD::ZEXTLOAD;
2058 break;
2059 case CCValAssign::AExt:
2060 ExtType = ISD::EXTLOAD;
2061 break;
2062 }
2063
2064 ArgValue = DAG.getExtLoad(
2065 ExtType, SL, VA.getLocVT(), Chain, FIN,
2067 MemVT);
2068 return ArgValue;
2069}
2070
2071SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2072 const SIMachineFunctionInfo &MFI,
2073 EVT VT,
2075 const ArgDescriptor *Reg = nullptr;
2076 const TargetRegisterClass *RC;
2077 LLT Ty;
2078
2080 const ArgDescriptor WorkGroupIDX =
2081 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2082 // If GridZ is not programmed in an entry function then the hardware will set
2083 // it to all zeros, so there is no need to mask the GridY value in the low
2084 // order bits.
2085 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2086 AMDGPU::TTMP7,
2087 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2088 const ArgDescriptor WorkGroupIDZ =
2089 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2090 if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
2091 switch (PVID) {
2093 Reg = &WorkGroupIDX;
2094 RC = &AMDGPU::SReg_32RegClass;
2095 Ty = LLT::scalar(32);
2096 break;
2098 Reg = &WorkGroupIDY;
2099 RC = &AMDGPU::SReg_32RegClass;
2100 Ty = LLT::scalar(32);
2101 break;
2103 Reg = &WorkGroupIDZ;
2104 RC = &AMDGPU::SReg_32RegClass;
2105 Ty = LLT::scalar(32);
2106 break;
2107 default:
2108 break;
2109 }
2110 }
2111
2112 if (!Reg)
2113 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2114 if (!Reg) {
2116 // It's possible for a kernarg intrinsic call to appear in a kernel with
2117 // no allocated segment, in which case we do not add the user sgpr
2118 // argument, so just return null.
2119 return DAG.getConstant(0, SDLoc(), VT);
2120 }
2121
2122 // It's undefined behavior if a function marked with the amdgpu-no-*
2123 // attributes uses the corresponding intrinsic.
2124 return DAG.getUNDEF(VT);
2125 }
2126
2127 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2128}
2129
2131 CallingConv::ID CallConv,
2132 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2133 FunctionType *FType,
2134 SIMachineFunctionInfo *Info) {
2135 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2136 const ISD::InputArg *Arg = &Ins[I];
2137
2138 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2139 "vector type argument should have been split");
2140
2141 // First check if it's a PS input addr.
2142 if (CallConv == CallingConv::AMDGPU_PS &&
2143 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2144 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2145
2146 // Inconveniently only the first part of the split is marked as isSplit,
2147 // so skip to the end. We only want to increment PSInputNum once for the
2148 // entire split argument.
2149 if (Arg->Flags.isSplit()) {
2150 while (!Arg->Flags.isSplitEnd()) {
2151 assert((!Arg->VT.isVector() ||
2152 Arg->VT.getScalarSizeInBits() == 16) &&
2153 "unexpected vector split in ps argument type");
2154 if (!SkipArg)
2155 Splits.push_back(*Arg);
2156 Arg = &Ins[++I];
2157 }
2158 }
2159
2160 if (SkipArg) {
2161 // We can safely skip PS inputs.
2162 Skipped.set(Arg->getOrigArgIndex());
2163 ++PSInputNum;
2164 continue;
2165 }
2166
2167 Info->markPSInputAllocated(PSInputNum);
2168 if (Arg->Used)
2169 Info->markPSInputEnabled(PSInputNum);
2170
2171 ++PSInputNum;
2172 }
2173
2174 Splits.push_back(*Arg);
2175 }
2176}
2177
2178// Allocate special inputs passed in VGPRs.
2180 MachineFunction &MF,
2181 const SIRegisterInfo &TRI,
2182 SIMachineFunctionInfo &Info) const {
2183 const LLT S32 = LLT::scalar(32);
2185
2186 if (Info.hasWorkItemIDX()) {
2187 Register Reg = AMDGPU::VGPR0;
2188 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2189
2190 CCInfo.AllocateReg(Reg);
2191 unsigned Mask = (Subtarget->hasPackedTID() &&
2192 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2193 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2194 }
2195
2196 if (Info.hasWorkItemIDY()) {
2197 assert(Info.hasWorkItemIDX());
2198 if (Subtarget->hasPackedTID()) {
2199 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2200 0x3ff << 10));
2201 } else {
2202 unsigned Reg = AMDGPU::VGPR1;
2203 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2204
2205 CCInfo.AllocateReg(Reg);
2206 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2207 }
2208 }
2209
2210 if (Info.hasWorkItemIDZ()) {
2211 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2212 if (Subtarget->hasPackedTID()) {
2213 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2214 0x3ff << 20));
2215 } else {
2216 unsigned Reg = AMDGPU::VGPR2;
2217 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2218
2219 CCInfo.AllocateReg(Reg);
2220 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2221 }
2222 }
2223}
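// Illustrative layout (assuming a subtarget with packed TIDs, as handled
// above): all three workitem IDs share VGPR0, and the ArgDescriptor masks
// select the individual 10-bit fields:
//   workitem ID X: VGPR0 & 0x3ff          (bits  9:0)
//   workitem ID Y: VGPR0 & (0x3ff << 10)  (bits 19:10)
//   workitem ID Z: VGPR0 & (0x3ff << 20)  (bits 29:20)
// Without packed TIDs, X/Y/Z instead occupy VGPR0/VGPR1/VGPR2 with no mask.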
2224
2225 // Try to allocate a VGPR at the end of the argument list, or if no argument
2226 // VGPRs are left, allocate a stack slot instead.
2227 // If \p Mask is given, it indicates the bitfield position in the register.
2228 // If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2229static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2230 ArgDescriptor Arg = ArgDescriptor()) {
2231 if (Arg.isSet())
2232 return ArgDescriptor::createArg(Arg, Mask);
2233
2234 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2235 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2236 if (RegIdx == ArgVGPRs.size()) {
2237 // Spill to stack required.
2238 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2239
2240 return ArgDescriptor::createStack(Offset, Mask);
2241 }
2242
2243 unsigned Reg = ArgVGPRs[RegIdx];
2244 Reg = CCInfo.AllocateReg(Reg);
2245 assert(Reg != AMDGPU::NoRegister);
2246
2247 MachineFunction &MF = CCInfo.getMachineFunction();
2248 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2249 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2250 return ArgDescriptor::createRegister(Reg, Mask);
2251}
2252
2254 const TargetRegisterClass *RC,
2255 unsigned NumArgRegs) {
2256 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2257 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2258 if (RegIdx == ArgSGPRs.size())
2259 report_fatal_error("ran out of SGPRs for arguments");
2260
2261 unsigned Reg = ArgSGPRs[RegIdx];
2262 Reg = CCInfo.AllocateReg(Reg);
2263 assert(Reg != AMDGPU::NoRegister);
2264
2265 MachineFunction &MF = CCInfo.getMachineFunction();
2266 MF.addLiveIn(Reg, RC);
2268}
2269
2270// If this has a fixed position, we still should allocate the register in the
2271// CCInfo state. Technically we could get away with this for values passed
2272// outside of the normal argument range.
2274 const TargetRegisterClass *RC,
2275 MCRegister Reg) {
2276 Reg = CCInfo.AllocateReg(Reg);
2277 assert(Reg != AMDGPU::NoRegister);
2278 MachineFunction &MF = CCInfo.getMachineFunction();
2279 MF.addLiveIn(Reg, RC);
2280}
2281
2282static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2283 if (Arg) {
2284 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2285 Arg.getRegister());
2286 } else
2287 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2288}
2289
2290static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2291 if (Arg) {
2292 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2293 Arg.getRegister());
2294 } else
2295 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2296}
2297
2298/// Allocate implicit function VGPR arguments at the end of allocated user
2299/// arguments.
2301 CCState &CCInfo, MachineFunction &MF,
2302 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2303 const unsigned Mask = 0x3ff;
2304 ArgDescriptor Arg;
2305
2306 if (Info.hasWorkItemIDX()) {
2307 Arg = allocateVGPR32Input(CCInfo, Mask);
2308 Info.setWorkItemIDX(Arg);
2309 }
2310
2311 if (Info.hasWorkItemIDY()) {
2312 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2313 Info.setWorkItemIDY(Arg);
2314 }
2315
2316 if (Info.hasWorkItemIDZ())
2317 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2318}
2319
2320/// Allocate implicit function VGPR arguments in fixed registers.
2322 CCState &CCInfo, MachineFunction &MF,
2323 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2324 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2325 if (!Reg)
2326 report_fatal_error("failed to allocate VGPR for implicit arguments");
2327
2328 const unsigned Mask = 0x3ff;
2329 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2330 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2331 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2332}
2333
2335 CCState &CCInfo,
2336 MachineFunction &MF,
2337 const SIRegisterInfo &TRI,
2338 SIMachineFunctionInfo &Info) const {
2339 auto &ArgInfo = Info.getArgInfo();
2340 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2341
2342 // TODO: Unify handling with private memory pointers.
2343 if (UserSGPRInfo.hasDispatchPtr())
2344 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2345
2346 const Module *M = MF.getFunction().getParent();
2347 if (UserSGPRInfo.hasQueuePtr() &&
2349 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2350
2351 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2352 // constant offset from the kernarg segment.
2353 if (Info.hasImplicitArgPtr())
2354 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2355
2356 if (UserSGPRInfo.hasDispatchID())
2357 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2358
2359 // flat_scratch_init is not applicable for non-kernel functions.
2360
2361 if (Info.hasWorkGroupIDX())
2362 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2363
2364 if (Info.hasWorkGroupIDY())
2365 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2366
2367 if (Info.hasWorkGroupIDZ())
2368 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2369
2370 if (Info.hasLDSKernelId())
2371 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2372}
2373
2374// Allocate special inputs passed in user SGPRs.
2376 MachineFunction &MF,
2377 const SIRegisterInfo &TRI,
2378 SIMachineFunctionInfo &Info) const {
2379 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2380 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2381 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2382 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2383 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2384 }
2385
2386 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2387 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2388 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2389 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2390 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2391 }
2392
2393 if (UserSGPRInfo.hasDispatchPtr()) {
2394 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2395 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2396 CCInfo.AllocateReg(DispatchPtrReg);
2397 }
2398
2399 const Module *M = MF.getFunction().getParent();
2400 if (UserSGPRInfo.hasQueuePtr() &&
2402 Register QueuePtrReg = Info.addQueuePtr(TRI);
2403 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2404 CCInfo.AllocateReg(QueuePtrReg);
2405 }
2406
2407 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2409 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2410 CCInfo.AllocateReg(InputPtrReg);
2411
2412 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2413 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2414 }
2415
2416 if (UserSGPRInfo.hasDispatchID()) {
2417 Register DispatchIDReg = Info.addDispatchID(TRI);
2418 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2419 CCInfo.AllocateReg(DispatchIDReg);
2420 }
2421
2422 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2423 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2424 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2425 CCInfo.AllocateReg(FlatScratchInitReg);
2426 }
2427
2428 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2429 // these from the dispatch pointer.
2430}
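// Rough sketch of the user SGPR layout produced above (only the requested
// inputs are present; sizes are in 32-bit SGPRs, in allocation order):
//   implicit buffer ptr     : 2   (non-HSA path)
//   private segment buffer  : 4
//   dispatch ptr            : 2
//   queue ptr               : 2
//   kernarg segment ptr     : 2
//   dispatch id             : 2
//   flat scratch init       : 2
// Each entry is added as a live-in and allocated in CCInfo in this order.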
2431
2432 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2433// sequential starting from the first argument.
2435 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2437 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2438 Function &F = MF.getFunction();
2439 unsigned LastExplicitArgOffset =
2440 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2441 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2442 bool InPreloadSequence = true;
2443 unsigned InIdx = 0;
2444 for (auto &Arg : F.args()) {
2445 if (!InPreloadSequence || !Arg.hasInRegAttr())
2446 break;
2447
2448 int ArgIdx = Arg.getArgNo();
2449 // Don't preload non-original args or parts not in the current preload
2450 // sequence.
2451 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2452 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2453 break;
2454
2455 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2456 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2457 InIdx++) {
2458 assert(ArgLocs[ArgIdx].isMemLoc());
2459 auto &ArgLoc = ArgLocs[InIdx];
2460 const Align KernelArgBaseAlign = Align(16);
2461 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2462 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2463 unsigned NumAllocSGPRs =
2464 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2465
2466 // Arg is preloaded into the previous SGPR.
2467 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2468 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2469 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2470 continue;
2471 }
2472
2473 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2474 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2475 // Check for free user SGPRs for preloading.
2476 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2477 SGPRInfo.getNumFreeUserSGPRs()) {
2478 InPreloadSequence = false;
2479 break;
2480 }
2481
2482 // Preload this argument.
2483 const TargetRegisterClass *RC =
2484 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2485 SmallVectorImpl<MCRegister> *PreloadRegs =
2486 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2487
2488 if (PreloadRegs->size() > 1)
2489 RC = &AMDGPU::SGPR_32RegClass;
2490 for (auto &Reg : *PreloadRegs) {
2491 assert(Reg);
2492 MF.addLiveIn(Reg, RC);
2493 CCInfo.AllocateReg(Reg);
2494 }
2495
2496 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2497 }
2498 }
2499}
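// Worked example (illustrative): suppose the previous explicit argument ended
// at LastExplicitArgOffset = 8 and the next inreg argument is an i64 at
// ArgOffset = 16. Then:
//   Padding       = 16 - 8               = 8 bytes
//   PaddingSGPRs  = alignTo(8, 4) / 4    = 2
//   NumAllocSGPRs = alignTo(64, 32) / 32 = 2
// Preloading proceeds only if PaddingSGPRs + NumAllocSGPRs + 1 (the synthetic
// SGPR) still fits in the remaining free user SGPRs.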
2500
2502 const SIRegisterInfo &TRI,
2503 SIMachineFunctionInfo &Info) const {
2504 // Always allocate this last since it is a synthetic preload.
2505 if (Info.hasLDSKernelId()) {
2506 Register Reg = Info.addLDSKernelId();
2507 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2508 CCInfo.AllocateReg(Reg);
2509 }
2510}
2511
2512// Allocate special input registers that are initialized per-wave.
2514 MachineFunction &MF,
2516 CallingConv::ID CallConv,
2517 bool IsShader) const {
2518 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2519 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2520 // Note: user SGPRs are handled by the front-end for graphics shaders
2521 // Pad up the used user SGPRs with dead inputs.
2522
2523 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2524 // before enabling architected SGPRs for workgroup IDs.
2525 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2526
2527 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2528 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2529 // rely on it to reach 16 since if we end up having no stack usage, it will
2530 // not really be added.
2531 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2532 Info.hasWorkGroupIDY() +
2533 Info.hasWorkGroupIDZ() +
2534 Info.hasWorkGroupInfo();
2535 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2536 Register Reg = Info.addReservedUserSGPR();
2537 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2538 CCInfo.AllocateReg(Reg);
2539 }
2540 }
2541
2542 if (!HasArchitectedSGPRs) {
2543 if (Info.hasWorkGroupIDX()) {
2544 Register Reg = Info.addWorkGroupIDX();
2545 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2546 CCInfo.AllocateReg(Reg);
2547 }
2548
2549 if (Info.hasWorkGroupIDY()) {
2550 Register Reg = Info.addWorkGroupIDY();
2551 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2552 CCInfo.AllocateReg(Reg);
2553 }
2554
2555 if (Info.hasWorkGroupIDZ()) {
2556 Register Reg = Info.addWorkGroupIDZ();
2557 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2558 CCInfo.AllocateReg(Reg);
2559 }
2560 }
2561
2562 if (Info.hasWorkGroupInfo()) {
2563 Register Reg = Info.addWorkGroupInfo();
2564 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2565 CCInfo.AllocateReg(Reg);
2566 }
2567
2568 if (Info.hasPrivateSegmentWaveByteOffset()) {
2569 // Scratch wave offset passed in system SGPR.
2570 unsigned PrivateSegmentWaveByteOffsetReg;
2571
2572 if (IsShader) {
2573 PrivateSegmentWaveByteOffsetReg =
2574 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2575
2576 // This is true if the scratch wave byte offset doesn't have a fixed
2577 // location.
2578 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2579 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2580 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2581 }
2582 } else
2583 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2584
2585 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2586 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2587 }
2588
2589 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2590 Info.getNumPreloadedSGPRs() >= 16);
2591}
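// Example of the hasUserSGPRInit16Bug padding above (illustrative): a kernel
// with 6 user SGPRs that only needs workgroup ID X (1 required system SGPR)
// gets 16 - (6 + 1) = 9 reserved dead user SGPRs appended, so at least 16
// SGPRs are initialized before the system SGPRs are added.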
2592
2594 MachineFunction &MF,
2595 const SIRegisterInfo &TRI,
2596 SIMachineFunctionInfo &Info) {
2597 // Now that we've figured out where the scratch register inputs are, see if
2598 // we should reserve the arguments and use them directly.
2599 MachineFrameInfo &MFI = MF.getFrameInfo();
2600 bool HasStackObjects = MFI.hasStackObjects();
2601 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2602
2603 // Record that we know we have non-spill stack objects so we don't need to
2604 // check all stack objects later.
2605 if (HasStackObjects)
2606 Info.setHasNonSpillStackObjects(true);
2607
2608 // Everything live out of a block is spilled with fast regalloc, so it's
2609 // almost certain that spilling will be required.
2610 if (TM.getOptLevel() == CodeGenOptLevel::None)
2611 HasStackObjects = true;
2612
2613 // For now assume stack access is needed in any callee functions, so we need
2614 // the scratch registers to pass in.
2615 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2616
2617 if (!ST.enableFlatScratch()) {
2618 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2619 // If we have stack objects, we unquestionably need the private buffer
2620 // resource. For the Code Object V2 ABI, this will be the first 4 user
2621 // SGPR inputs. We can reserve those and use them directly.
2622
2623 Register PrivateSegmentBufferReg =
2625 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2626 } else {
2627 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2628 // We tentatively reserve the last registers (skipping those which may
2629 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2630 // we'll replace these with the ones immediately after those which were
2631 // really allocated. In the prologue copies will be inserted from the
2632 // argument to these reserved registers.
2633
2634 // Without HSA, relocations are used for the scratch pointer and the
2635 // buffer resource setup is always inserted in the prologue. Scratch wave
2636 // offset is still in an input SGPR.
2637 Info.setScratchRSrcReg(ReservedBufferReg);
2638 }
2639 }
2640
2642
2643 // For entry functions we have to set up the stack pointer if we use it,
2644 // whereas non-entry functions get this "for free". This means there is no
2645 // intrinsic advantage to using S32 over S34 in cases where we do not have
2646 // calls but do need a frame pointer (i.e. if we are requested to have one
2647 // because frame pointer elimination is disabled). To keep things simple we
2648 // only ever use S32 as the call ABI stack pointer, and so using it does not
2649 // imply we need a separate frame pointer.
2650 //
2651 // Try to use s32 as the SP, but move it if it would interfere with input
2652 // arguments. This won't work with calls though.
2653 //
2654 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2655 // registers.
2656 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2657 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2658 } else {
2660
2661 if (MFI.hasCalls())
2662 report_fatal_error("call in graphics shader with too many input SGPRs");
2663
2664 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2665 if (!MRI.isLiveIn(Reg)) {
2666 Info.setStackPtrOffsetReg(Reg);
2667 break;
2668 }
2669 }
2670
2671 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2672 report_fatal_error("failed to find register for SP");
2673 }
2674
2675 // hasFP should be accurate for entry functions even before the frame is
2676 // finalized, because it does not rely on the known stack size, only
2677 // properties like whether variable sized objects are present.
2678 if (ST.getFrameLowering()->hasFP(MF)) {
2679 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2680 }
2681}
2682
2685 return !Info->isEntryFunction();
2686}
2687
2689
2690}
2691
2693 MachineBasicBlock *Entry,
2694 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2696
2697 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2698 if (!IStart)
2699 return;
2700
2701 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2702 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2703 MachineBasicBlock::iterator MBBI = Entry->begin();
2704 for (const MCPhysReg *I = IStart; *I; ++I) {
2705 const TargetRegisterClass *RC = nullptr;
2706 if (AMDGPU::SReg_64RegClass.contains(*I))
2707 RC = &AMDGPU::SGPR_64RegClass;
2708 else if (AMDGPU::SReg_32RegClass.contains(*I))
2709 RC = &AMDGPU::SGPR_32RegClass;
2710 else
2711 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2712
2713 Register NewVR = MRI->createVirtualRegister(RC);
2714 // Create copy from CSR to a virtual register.
2715 Entry->addLiveIn(*I);
2716 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2717 .addReg(*I);
2718
2719 // Insert the copy-back instructions right before the terminator.
2720 for (auto *Exit : Exits)
2721 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2722 TII->get(TargetOpcode::COPY), *I)
2723 .addReg(NewVR);
2724 }
2725}
2726
2728 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2729 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2730 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2732
2734 const Function &Fn = MF.getFunction();
2737
2738 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2739 DiagnosticInfoUnsupported NoGraphicsHSA(
2740 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2741 DAG.getContext()->diagnose(NoGraphicsHSA);
2742 return DAG.getEntryNode();
2743 }
2744
2747 BitVector Skipped(Ins.size());
2748 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2749 *DAG.getContext());
2750
2751 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2752 bool IsKernel = AMDGPU::isKernel(CallConv);
2753 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2754
2755 if (IsGraphics) {
2756 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2757 assert(!UserSGPRInfo.hasDispatchPtr() &&
2758 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2759 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2760 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2761 (void)UserSGPRInfo;
2762 if (!Subtarget->enableFlatScratch())
2763 assert(!UserSGPRInfo.hasFlatScratchInit());
2764 if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
2765 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2766 !Info->hasWorkGroupIDZ());
2767 }
2768
2769 if (CallConv == CallingConv::AMDGPU_PS) {
2770 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2771
2772 // At least one interpolation mode must be enabled or else the GPU will
2773 // hang.
2774 //
2775 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2776 // set PSInputAddr, the user wants to enable some bits after the compilation
2777 // based on run-time states. Since we can't know what the final PSInputEna
2778 // will look like, we shouldn't do anything here and the user should take
2779 // responsibility for the correct programming.
2780 //
2781 // Otherwise, the following restrictions apply:
2782 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2783 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2784 // enabled too.
2785 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2786 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2787 CCInfo.AllocateReg(AMDGPU::VGPR0);
2788 CCInfo.AllocateReg(AMDGPU::VGPR1);
2789 Info->markPSInputAllocated(0);
2790 Info->markPSInputEnabled(0);
2791 }
2792 if (Subtarget->isAmdPalOS()) {
2793 // For isAmdPalOS, the user does not enable some bits after compilation
2794 // based on run-time states; the register values being generated here are
2795 // the final ones set in hardware. Therefore we need to apply the
2796 // workaround to PSInputAddr and PSInputEnable together. (The case where
2797 // a bit is set in PSInputAddr but not PSInputEnable is where the
2798 // frontend set up an input arg for a particular interpolation mode, but
2799 // nothing uses that input arg. Really we should have an earlier pass
2800 // that removes such an arg.)
2801 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2802 if ((PsInputBits & 0x7F) == 0 ||
2803 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2804 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2805 }
2806 } else if (IsKernel) {
2807 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2808 } else {
2809 Splits.append(Ins.begin(), Ins.end());
2810 }
2811
2812 if (IsKernel)
2813 analyzeFormalArgumentsCompute(CCInfo, Ins);
2814
2815 if (IsEntryFunc) {
2816 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2817 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2818 if (IsKernel && Subtarget->hasKernargPreload() &&
2820 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2821
2822 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2823 } else if (!IsGraphics) {
2824 // For the fixed ABI, pass workitem IDs in the last argument register.
2825 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2826
2827 // FIXME: Sink this into allocateSpecialInputSGPRs
2828 if (!Subtarget->enableFlatScratch())
2829 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2830
2831 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2832 }
2833
2834 if (!IsKernel) {
2835 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2836 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2837 }
2838
2840
2841 // FIXME: This is the minimum kernel argument alignment. We should improve
2842 // this to the maximum alignment of the arguments.
2843 //
2844 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2845 // kern arg offset.
2846 const Align KernelArgBaseAlign = Align(16);
2847
2848 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2849 const ISD::InputArg &Arg = Ins[i];
2850 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2851 InVals.push_back(DAG.getUNDEF(Arg.VT));
2852 continue;
2853 }
2854
2855 CCValAssign &VA = ArgLocs[ArgIdx++];
2856 MVT VT = VA.getLocVT();
2857
2858 if (IsEntryFunc && VA.isMemLoc()) {
2859 VT = Ins[i].VT;
2860 EVT MemVT = VA.getLocVT();
2861
2862 const uint64_t Offset = VA.getLocMemOffset();
2863 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2864
2865 if (Arg.Flags.isByRef()) {
2866 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2867
2868 const GCNTargetMachine &TM =
2869 static_cast<const GCNTargetMachine &>(getTargetMachine());
2870 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2871 Arg.Flags.getPointerAddrSpace())) {
2874 }
2875
2876 InVals.push_back(Ptr);
2877 continue;
2878 }
2879
2880 SDValue NewArg;
2881 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2882 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2883 // In this case the argument is packed into the previous preload SGPR.
2884 int64_t AlignDownOffset = alignDown(Offset, 4);
2885 int64_t OffsetDiff = Offset - AlignDownOffset;
2886 EVT IntVT = MemVT.changeTypeToInteger();
2887
2891 Register Reg =
2892 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2893
2894 assert(Reg);
2895 Register VReg = MRI.getLiveInVirtReg(Reg);
2896 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2897
2898 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2899 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2900
2901 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2902 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2903 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2904 Ins[i].Flags.isSExt(), &Ins[i]);
2905
2906 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2907 } else {
2911 const SmallVectorImpl<MCRegister> &PreloadRegs =
2912 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2913
2914 SDValue Copy;
2915 if (PreloadRegs.size() == 1) {
2916 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2917 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2918 NewArg = DAG.getCopyFromReg(
2919 Chain, DL, VReg,
2921 TRI->getRegSizeInBits(*RC)));
2922
2923 } else {
2924 // If the kernarg alignment does not match the alignment of the SGPR
2925 // tuple RC that can accommodate this argument, it will be built up
2926 // via copies from from the individual SGPRs that the argument was
2927 // preloaded to.
2929 for (auto Reg : PreloadRegs) {
2930 Register VReg = MRI.getLiveInVirtReg(Reg);
2931 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2932 Elts.push_back(Copy);
2933 }
2934 NewArg =
2935 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2936 PreloadRegs.size()),
2937 DL, Elts);
2938 }
2939
2940 SDValue CMemVT;
2941 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2942 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2943 else
2944 CMemVT = DAG.getBitcast(MemVT, NewArg);
2945 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2946 Ins[i].Flags.isSExt(), &Ins[i]);
2947 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2948 }
2949 } else {
2950 NewArg =
2951 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2952 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2953 }
2954 Chains.push_back(NewArg.getValue(1));
2955
2956 auto *ParamTy =
2957 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2959 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2960 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2961 // On SI local pointers are just offsets into LDS, so they are always
2962 // less than 16 bits. On CI and newer they could potentially be
2963 // real pointers, so we can't guarantee their size.
2964 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
2965 DAG.getValueType(MVT::i16));
2966 }
2967
2968 InVals.push_back(NewArg);
2969 continue;
2970 } else if (!IsEntryFunc && VA.isMemLoc()) {
2971 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2972 InVals.push_back(Val);
2973 if (!Arg.Flags.isByVal())
2974 Chains.push_back(Val.getValue(1));
2975 continue;
2976 }
2977
2978 assert(VA.isRegLoc() && "Parameter must be in a register!");
2979
2980 Register Reg = VA.getLocReg();
2981 const TargetRegisterClass *RC = nullptr;
2982 if (AMDGPU::VGPR_32RegClass.contains(Reg))
2983 RC = &AMDGPU::VGPR_32RegClass;
2984 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2985 RC = &AMDGPU::SGPR_32RegClass;
2986 else
2987 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2988 EVT ValVT = VA.getValVT();
2989
2990 Reg = MF.addLiveIn(Reg, RC);
2991 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2992
2993 if (Arg.Flags.isSRet()) {
2994 // The return object should be reasonably addressable.
2995
2996 // FIXME: This helps when the return is a real sret. If it is an
2997 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2998 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2999 unsigned NumBits
3001 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3002 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3003 }
3004
3005 // If this is an 8 or 16-bit value, it is really passed promoted
3006 // to 32 bits. Insert an assert[sz]ext to capture this, then
3007 // truncate to the right size.
3008 switch (VA.getLocInfo()) {
3009 case CCValAssign::Full:
3010 break;
3011 case CCValAssign::BCvt:
3012 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3013 break;
3014 case CCValAssign::SExt:
3015 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3016 DAG.getValueType(ValVT));
3017 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3018 break;
3019 case CCValAssign::ZExt:
3020 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3021 DAG.getValueType(ValVT));
3022 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3023 break;
3024 case CCValAssign::AExt:
3025 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3026 break;
3027 default:
3028 llvm_unreachable("Unknown loc info!");
3029 }
3030
3031 InVals.push_back(Val);
3032 }
3033
3034 // Start adding system SGPRs.
3035 if (IsEntryFunc)
3036 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3037
3038 auto &ArgUsageInfo =
3040 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3041
3042 unsigned StackArgSize = CCInfo.getStackSize();
3043 Info->setBytesInStackArgArea(StackArgSize);
3044
3045 return Chains.empty() ? Chain :
3046 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3047}
3048
3049// TODO: If return values can't fit in registers, we should return as many as
3050// possible in registers before passing on stack.
3052 CallingConv::ID CallConv,
3053 MachineFunction &MF, bool IsVarArg,
3055 LLVMContext &Context) const {
3056 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3057 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3058 // for shaders. Vector types should be explicitly handled by CC.
3059 if (AMDGPU::isEntryFunctionCC(CallConv))
3060 return true;
3061
3063 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3064 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3065 return false;
3066
3067 // We must use the stack if return would require unavailable registers.
3068 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3069 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3070 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3071 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3072 return false;
3073
3074 return true;
3075}
3076
3077SDValue
3079 bool isVarArg,
3081 const SmallVectorImpl<SDValue> &OutVals,
3082 const SDLoc &DL, SelectionDAG &DAG) const {
3085
3086 if (AMDGPU::isKernel(CallConv)) {
3087 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3088 OutVals, DL, DAG);
3089 }
3090
3091 bool IsShader = AMDGPU::isShader(CallConv);
3092
3093 Info->setIfReturnsVoid(Outs.empty());
3094 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3095
3096 // CCValAssign - represents the assignment of the return value to a location.
3099
3100 // CCState - Info about the registers and stack slots.
3101 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3102 *DAG.getContext());
3103
3104 // Analyze outgoing return values.
3105 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3106
3107 SDValue Glue;
3109 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3110
3111 // Copy the result values into the output registers.
3112 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3113 ++I, ++RealRVLocIdx) {
3114 CCValAssign &VA = RVLocs[I];
3115 assert(VA.isRegLoc() && "Can only return in registers!");
3116 // TODO: Partially return in registers if return values don't fit.
3117 SDValue Arg = OutVals[RealRVLocIdx];
3118
3119 // Copied from other backends.
3120 switch (VA.getLocInfo()) {
3121 case CCValAssign::Full:
3122 break;
3123 case CCValAssign::BCvt:
3124 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3125 break;
3126 case CCValAssign::SExt:
3127 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3128 break;
3129 case CCValAssign::ZExt:
3130 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3131 break;
3132 case CCValAssign::AExt:
3133 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3134 break;
3135 default:
3136 llvm_unreachable("Unknown loc info!");
3137 }
3138
3139 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3140 Glue = Chain.getValue(1);
3141 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3142 }
3143
3144 // FIXME: Does sret work properly?
3145 if (!Info->isEntryFunction()) {
3146 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3147 const MCPhysReg *I =
3148 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3149 if (I) {
3150 for (; *I; ++I) {
3151 if (AMDGPU::SReg_64RegClass.contains(*I))
3152 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3153 else if (AMDGPU::SReg_32RegClass.contains(*I))
3154 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3155 else
3156 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3157 }
3158 }
3159 }
3160
3161 // Update chain and glue.
3162 RetOps[0] = Chain;
3163 if (Glue.getNode())
3164 RetOps.push_back(Glue);
3165
3166 unsigned Opc = AMDGPUISD::ENDPGM;
3167 if (!IsWaveEnd)
3169 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3170}
3171
3173 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3174 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3175 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3176 SDValue ThisVal) const {
3177 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3178
3179 // Assign locations to each value returned by this call.
3181 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3182 *DAG.getContext());
3183 CCInfo.AnalyzeCallResult(Ins, RetCC);
3184
3185 // Copy all of the result registers out of their specified physreg.
3186 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3187 CCValAssign VA = RVLocs[i];
3188 SDValue Val;
3189
3190 if (VA.isRegLoc()) {
3191 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3192 Chain = Val.getValue(1);
3193 InGlue = Val.getValue(2);
3194 } else if (VA.isMemLoc()) {
3195 report_fatal_error("TODO: return values in memory");
3196 } else
3197 llvm_unreachable("unknown argument location type");
3198
3199 switch (VA.getLocInfo()) {
3200 case CCValAssign::Full:
3201 break;
3202 case CCValAssign::BCvt:
3203 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3204 break;
3205 case CCValAssign::ZExt:
3206 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3207 DAG.getValueType(VA.getValVT()));
3208 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3209 break;
3210 case CCValAssign::SExt:
3211 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3212 DAG.getValueType(VA.getValVT()));
3213 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3214 break;
3215 case CCValAssign::AExt:
3216 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3217 break;
3218 default:
3219 llvm_unreachable("Unknown loc info!");
3220 }
3221
3222 InVals.push_back(Val);
3223 }
3224
3225 return Chain;
3226}
3227
3228 // Add code to pass special inputs that are required depending on the features
3229 // used, separate from the explicit user arguments present in the IR.
3231 CallLoweringInfo &CLI,
3232 CCState &CCInfo,
3233 const SIMachineFunctionInfo &Info,
3234 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3235 SmallVectorImpl<SDValue> &MemOpChains,
3236 SDValue Chain) const {
3237 // If we don't have a call site, this was a call inserted by
3238 // legalization. These can never use special inputs.
3239 if (!CLI.CB)
3240 return;
3241
3242 SelectionDAG &DAG = CLI.DAG;
3243 const SDLoc &DL = CLI.DL;
3244 const Function &F = DAG.getMachineFunction().getFunction();
3245
3246 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3247 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3248
3249 const AMDGPUFunctionArgInfo *CalleeArgInfo
3251 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3252 auto &ArgUsageInfo =
3254 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3255 }
3256
3257 // TODO: Unify with private memory register handling. This is complicated by
3258 // the fact that at least in kernels, the input argument is not necessarily
3259 // in the same location as the input.
3260 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3262 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3263 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3264 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3265 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3266 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3267 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3268 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3269 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3270 };
3271
3272 for (auto Attr : ImplicitAttrs) {
3273 const ArgDescriptor *OutgoingArg;
3274 const TargetRegisterClass *ArgRC;
3275 LLT ArgTy;
3276
3277 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3278
3279 // If the callee does not use the attribute value, skip copying the value.
3280 if (CLI.CB->hasFnAttr(Attr.second))
3281 continue;
3282
3283 std::tie(OutgoingArg, ArgRC, ArgTy) =
3284 CalleeArgInfo->getPreloadedValue(InputID);
3285 if (!OutgoingArg)
3286 continue;
3287
3288 const ArgDescriptor *IncomingArg;
3289 const TargetRegisterClass *IncomingArgRC;
3290 LLT Ty;
3291 std::tie(IncomingArg, IncomingArgRC, Ty) =
3292 CallerArgInfo.getPreloadedValue(InputID);
3293 assert(IncomingArgRC == ArgRC);
3294
3295 // All special arguments are ints for now.
3296 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3297 SDValue InputReg;
3298
3299 if (IncomingArg) {
3300 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3301 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3302 // The implicit arg ptr is special because it doesn't have a corresponding
3303 // input for kernels, and is computed from the kernarg segment pointer.
3304 InputReg = getImplicitArgPtr(DAG, DL);
3305 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3306 std::optional<uint32_t> Id =
3308 if (Id.has_value()) {
3309 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3310 } else {
3311 InputReg = DAG.getUNDEF(ArgVT);
3312 }
3313 } else {
3314 // We may have proven the input wasn't needed, although the ABI is
3315 // requiring it. We just need to allocate the register appropriately.
3316 InputReg = DAG.getUNDEF(ArgVT);
3317 }
3318
3319 if (OutgoingArg->isRegister()) {
3320 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3321 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3322 report_fatal_error("failed to allocate implicit input argument");
3323 } else {
3324 unsigned SpecialArgOffset =
3325 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3326 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3327 SpecialArgOffset);
3328 MemOpChains.push_back(ArgStore);
3329 }
3330 }
3331
3332 // Pack workitem IDs into a single register, or pass them as-is if already
3333 // packed.
3334 const ArgDescriptor *OutgoingArg;
3335 const TargetRegisterClass *ArgRC;
3336 LLT Ty;
3337
3338 std::tie(OutgoingArg, ArgRC, Ty) =
3340 if (!OutgoingArg)
3341 std::tie(OutgoingArg, ArgRC, Ty) =
3343 if (!OutgoingArg)
3344 std::tie(OutgoingArg, ArgRC, Ty) =
3346 if (!OutgoingArg)
3347 return;
3348
3349 const ArgDescriptor *IncomingArgX = std::get<0>(
3351 const ArgDescriptor *IncomingArgY = std::get<0>(
3353 const ArgDescriptor *IncomingArgZ = std::get<0>(
3355
3356 SDValue InputReg;
3357 SDLoc SL;
3358
3359 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3360 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3361 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3362
3363 // If incoming ids are not packed we need to pack them.
3364 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3365 NeedWorkItemIDX) {
3366 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3367 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3368 } else {
3369 InputReg = DAG.getConstant(0, DL, MVT::i32);
3370 }
3371 }
3372
3373 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3374 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3375 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3376 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3377 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3378 InputReg = InputReg.getNode() ?
3379 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3380 }
3381
3382 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3383 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3384 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3385 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3386 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3387 InputReg = InputReg.getNode() ?
3388 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3389 }
3390
3391 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3392 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3393 // We're in a situation where the outgoing function requires the workitem
3394 // ID, but the calling function does not have it (e.g. a graphics function
3395 // calling a C calling convention function). This is illegal, but we need
3396 // to produce something.
3397 InputReg = DAG.getUNDEF(MVT::i32);
3398 } else {
3399 // Workitem ids are already packed; any of the present incoming arguments
3400 // will carry all required fields.
3402 IncomingArgX ? *IncomingArgX :
3403 IncomingArgY ? *IncomingArgY :
3404 *IncomingArgZ, ~0u);
3405 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3406 }
3407 }
3408
3409 if (OutgoingArg->isRegister()) {
3410 if (InputReg)
3411 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3412
3413 CCInfo.AllocateReg(OutgoingArg->getRegister());
3414 } else {
3415 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3416 if (InputReg) {
3417 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3418 SpecialArgOffset);
3419 MemOpChains.push_back(ArgStore);
3420 }
3421 }
3422}
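// Illustrative packing performed above when the incoming IDs are unpacked:
//   packed = X | (Y << 10) | (Z << 20)
// so each workitem ID occupies a 10-bit field of a single i32, matching the
// 0x3ff masks used for the fixed-register case. If an incoming argument is
// already masked (i.e. packed), it is forwarded as-is since it carries all
// three fields.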
3423
3425 return CC == CallingConv::Fast;
3426}
3427
3428/// Return true if we might ever do TCO for calls with this calling convention.
3430 switch (CC) {
3431 case CallingConv::C:
3433 return true;
3434 default:
3435 return canGuaranteeTCO(CC);
3436 }
3437}
3438
3440 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3442 const SmallVectorImpl<SDValue> &OutVals,
3443 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3444 if (AMDGPU::isChainCC(CalleeCC))
3445 return true;
3446
3447 if (!mayTailCallThisCC(CalleeCC))
3448 return false;
3449
3450 // For a divergent call target, we need to do a waterfall loop over the
3451 // possible callees which precludes us from using a simple jump.
3452 if (Callee->isDivergent())
3453 return false;
3454
3456 const Function &CallerF = MF.getFunction();
3457 CallingConv::ID CallerCC = CallerF.getCallingConv();
3459 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3460
3461 // Kernels aren't callable, and don't have a live-in return address, so it
3462 // doesn't make sense to do a tail call with entry functions.
3463 if (!CallerPreserved)
3464 return false;
3465
3466 bool CCMatch = CallerCC == CalleeCC;
3467
3469 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3470 return true;
3471 return false;
3472 }
3473
3474 // TODO: Can we handle var args?
3475 if (IsVarArg)
3476 return false;
3477
3478 for (const Argument &Arg : CallerF.args()) {
3479 if (Arg.hasByValAttr())
3480 return false;
3481 }
3482
3483 LLVMContext &Ctx = *DAG.getContext();
3484
3485 // Check that the call results are passed in the same way.
3486 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3487 CCAssignFnForCall(CalleeCC, IsVarArg),
3488 CCAssignFnForCall(CallerCC, IsVarArg)))
3489 return false;
3490
3491 // The callee has to preserve all registers the caller needs to preserve.
3492 if (!CCMatch) {
3493 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3494 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3495 return false;
3496 }
3497
3498 // Nothing more to check if the callee is taking no arguments.
3499 if (Outs.empty())
3500 return true;
3501
3503 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3504
3505 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3506
3507 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3508 // If the stack arguments for this call do not fit into our own save area then
3509 // the call cannot be made a tail call.
3510 // TODO: Is this really necessary?
3511 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3512 return false;
3513
3514 const MachineRegisterInfo &MRI = MF.getRegInfo();
3515 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3516}
3517
3519 if (!CI->isTailCall())
3520 return false;
3521
3522 const Function *ParentFn = CI->getParent()->getParent();
3524 return false;
3525 return true;
3526}
3527
3528// The wave scratch offset register is used as the global base pointer.
3530 SmallVectorImpl<SDValue> &InVals) const {
3531 CallingConv::ID CallConv = CLI.CallConv;
3532 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3533
3534 SelectionDAG &DAG = CLI.DAG;
3535
3536 TargetLowering::ArgListEntry RequestedExec;
3537 if (IsChainCallConv) {
3538 // The last argument should be the value that we need to put in EXEC.
3539 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3540 // don't treat it like the rest of the arguments.
3541 RequestedExec = CLI.Args.back();
3542 assert(RequestedExec.Node && "No node for EXEC");
3543
3544 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3545 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3546
3547 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3548 CLI.Outs.pop_back();
3549 CLI.OutVals.pop_back();
3550
3551 if (RequestedExec.Ty->isIntegerTy(64)) {
3552 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3553 CLI.Outs.pop_back();
3554 CLI.OutVals.pop_back();
3555 }
3556
3557 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3558 "Haven't popped all the pieces of the EXEC mask");
3559 }
3560
3561 const SDLoc &DL = CLI.DL;
3563 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3565 SDValue Chain = CLI.Chain;
3566 SDValue Callee = CLI.Callee;
3567 bool &IsTailCall = CLI.IsTailCall;
3568 bool IsVarArg = CLI.IsVarArg;
3569 bool IsSibCall = false;
3570 bool IsThisReturn = false;
3572
3573 if (Callee.isUndef() || isNullConstant(Callee)) {
3574 if (!CLI.IsTailCall) {
3575 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3576 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3577 }
3578
3579 return Chain;
3580 }
3581
3582 if (IsVarArg) {
3583 return lowerUnhandledCall(CLI, InVals,
3584 "unsupported call to variadic function ");
3585 }
3586
3587 if (!CLI.CB)
3588 report_fatal_error("unsupported libcall legalization");
3589
3590 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3591 return lowerUnhandledCall(CLI, InVals,
3592 "unsupported required tail call to function ");
3593 }
3594
3595 if (IsTailCall) {
3597 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3598 if (!IsTailCall &&
3599 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3600 report_fatal_error("failed to perform tail call elimination on a call "
3601 "site marked musttail or on llvm.amdgcn.cs.chain");
3602 }
3603
3604 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3605
3606 // A sibling call is one where we're under the usual C ABI and not planning
3607 // to change that but can still do a tail call:
3608 if (!TailCallOpt && IsTailCall)
3609 IsSibCall = true;
3610
3611 if (IsTailCall)
3612 ++NumTailCalls;
3613 }
3614
3617 SmallVector<SDValue, 8> MemOpChains;
3618
3619 // Analyze operands of the call, assigning locations to each operand.
3621 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3622 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3623
3624 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3625 // With a fixed ABI, allocate fixed registers before user arguments.
3626 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3627 }
3628
3629 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3630
3631 // Get a count of how many bytes are to be pushed on the stack.
3632 unsigned NumBytes = CCInfo.getStackSize();
3633
3634 if (IsSibCall) {
3635 // Since we're not changing the ABI to make this a tail call, the memory
3636 // operands are already available in the caller's incoming argument space.
3637 NumBytes = 0;
3638 }
3639
3640 // FPDiff is the byte offset of the call's argument area from the callee's.
3641 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3642 // by this amount for a tail call. In a sibling call it must be 0 because the
3643 // caller will deallocate the entire stack and the callee still expects its
3644 // arguments to begin at SP+0. Completely unused for non-tail calls.
3645 int32_t FPDiff = 0;
3646 MachineFrameInfo &MFI = MF.getFrameInfo();
3647
3648 // Adjust the stack pointer for the new arguments...
3649 // These operations are automatically eliminated by the prolog/epilog pass
3650 if (!IsSibCall)
3651 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3652
3653 if (!IsSibCall || IsChainCallConv) {
3654 if (!Subtarget->enableFlatScratch()) {
3655 SmallVector<SDValue, 4> CopyFromChains;
3656
3657 // In the HSA case, this should be an identity copy.
3658 SDValue ScratchRSrcReg
3659 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3660 RegsToPass.emplace_back(IsChainCallConv
3661 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3662 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3663 ScratchRSrcReg);
3664 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3665 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3666 }
3667 }
3668
3669 MVT PtrVT = MVT::i32;
3670
3671 // Walk the register/memloc assignments, inserting copies/loads.
3672 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3673 CCValAssign &VA = ArgLocs[i];
3674 SDValue Arg = OutVals[i];
3675
3676 // Promote the value if needed.
3677 switch (VA.getLocInfo()) {
3678 case CCValAssign::Full:
3679 break;
3680 case CCValAssign::BCvt:
3681 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3682 break;
3683 case CCValAssign::ZExt:
3684 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3685 break;
3686 case CCValAssign::SExt:
3687 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3688 break;
3689 case CCValAssign::AExt:
3690 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3691 break;
3692 case CCValAssign::FPExt:
3693 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3694 break;
3695 default:
3696 llvm_unreachable("Unknown loc info!");
3697 }
3698
3699 if (VA.isRegLoc()) {
3700 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3701 } else {
3702 assert(VA.isMemLoc());
3703
3704 SDValue DstAddr;
3705 MachinePointerInfo DstInfo;
3706
3707 unsigned LocMemOffset = VA.getLocMemOffset();
3708 int32_t Offset = LocMemOffset;
3709
3710 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3711 MaybeAlign Alignment;
3712
3713 if (IsTailCall) {
3714 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3715 unsigned OpSize = Flags.isByVal() ?
3716 Flags.getByValSize() : VA.getValVT().getStoreSize();
3717
3718 // FIXME: We can do better than the minimum required byval alignment.
3719 Alignment =
3720 Flags.isByVal()
3721 ? Flags.getNonZeroByValAlign()
3722 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3723
3724 Offset = Offset + FPDiff;
3725 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3726
3727 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3728 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3729
3730 // Make sure any stack arguments overlapping with where we're storing
3731 // are loaded before this eventual operation. Otherwise they'll be
3732 // clobbered.
3733
3734 // FIXME: Why is this really necessary? This seems to just result in a
3735 // lot of code to copy the stack values and write them back to the same
3736 // locations, which are supposed to be immutable?
3737 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3738 } else {
3739 // Stores to the argument stack area are relative to the stack pointer.
3740 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3741 MVT::i32);
3742 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3743 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3744 Alignment =
3745 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3746 }
3747
3748 if (Outs[i].Flags.isByVal()) {
3749 SDValue SizeNode =
3750 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3751 SDValue Cpy =
3752 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3753 Outs[i].Flags.getNonZeroByValAlign(),
3754 /*isVol = */ false, /*AlwaysInline = */ true,
3755 /*isTailCall = */ false, DstInfo,
3757
3758 MemOpChains.push_back(Cpy);
3759 } else {
3760 SDValue Store =
3761 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3762 MemOpChains.push_back(Store);
3763 }
3764 }
3765 }
3766
3767 if (!MemOpChains.empty())
3768 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3769
3770 // Build a sequence of copy-to-reg nodes chained together with token chain
3771 // and flag operands which copy the outgoing args into the appropriate regs.
3772 SDValue InGlue;
3773 for (auto &RegToPass : RegsToPass) {
3774 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3775 RegToPass.second, InGlue);
3776 InGlue = Chain.getValue(1);
3777 }
3778
3779
3780 // We don't usually want to end the call-sequence here because we would tidy
3781 // the frame up *after* the call; however, in the ABI-changing tail-call case
3782 // we've carefully laid out the parameters so that when sp is reset they'll be
3783 // in the correct location.
3784 if (IsTailCall && !IsSibCall) {
3785 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3786 InGlue = Chain.getValue(1);
3787 }
3788
3789 std::vector<SDValue> Ops;
3790 Ops.push_back(Chain);
3791 Ops.push_back(Callee);
3792 // Add a redundant copy of the callee global which will not be legalized, as
3793 // we need direct access to the callee later.
3794 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3795 const GlobalValue *GV = GSD->getGlobal();
3796 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3797 } else {
3798 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3799 }
3800
3801 if (IsTailCall) {
3802 // Each tail call may have to adjust the stack by a different amount, so
3803 // this information must travel along with the operation for eventual
3804 // consumption by emitEpilogue.
3805 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3806 }
3807
3808 if (IsChainCallConv)
3809 Ops.push_back(RequestedExec.Node);
3810
3811 // Add argument registers to the end of the list so that they are known live
3812 // into the call.
3813 for (auto &RegToPass : RegsToPass) {
3814 Ops.push_back(DAG.getRegister(RegToPass.first,
3815 RegToPass.second.getValueType()));
3816 }
3817
3818 // Add a register mask operand representing the call-preserved registers.
3819 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3820 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3821 assert(Mask && "Missing call preserved mask for calling convention");
3822 Ops.push_back(DAG.getRegisterMask(Mask));
3823
3824 if (InGlue.getNode())
3825 Ops.push_back(InGlue);
3826
3827 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3828
3829 // If we're doing a tail call, use a TC_RETURN here rather than an
3830 // actual call instruction.
3831 if (IsTailCall) {
3832 MFI.setHasTailCall();
3833 unsigned OPC = AMDGPUISD::TC_RETURN;
3834 switch (CallConv) {
3837 break;
3841 break;
3842 }
3843
3844 return DAG.getNode(OPC, DL, NodeTys, Ops);
3845 }
3846
3847 // Returns a chain and a flag for retval copy to use.
3848 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3849 Chain = Call.getValue(0);
3850 InGlue = Call.getValue(1);
3851
3852 uint64_t CalleePopBytes = NumBytes;
3853 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3854 if (!Ins.empty())
3855 InGlue = Chain.getValue(1);
3856
3857 // Handle result values, copying them out of physregs into vregs that we
3858 // return.
3859 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3860 InVals, IsThisReturn,
3861 IsThisReturn ? OutVals[0] : SDValue());
3862}
3863
3864// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3865// except for applying the wave size scale to the increment amount.
3867 SDValue Op, SelectionDAG &DAG) const {
3868 const MachineFunction &MF = DAG.getMachineFunction();
3870
3871 SDLoc dl(Op);
3872 EVT VT = Op.getValueType();
3873 SDValue Tmp1 = Op;
3874 SDValue Tmp2 = Op.getValue(1);
3875 SDValue Tmp3 = Op.getOperand(2);
3876 SDValue Chain = Tmp1.getOperand(0);
3877
3878 Register SPReg = Info->getStackPtrOffsetReg();
3879
3880 // Chain the dynamic stack allocation so that it doesn't modify the stack
3881 // pointer when other instructions are using the stack.
3882 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3883
3884 SDValue Size = Tmp2.getOperand(1);
3885 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3886 Chain = SP.getValue(1);
3887 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3888 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3889 unsigned Opc =
3892
3893 SDValue ScaledSize = DAG.getNode(
3894 ISD::SHL, dl, VT, Size,
3895 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3896
3897 Align StackAlign = TFL->getStackAlign();
3898 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3899 if (Alignment && *Alignment > StackAlign) {
3900 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3901 DAG.getConstant(-(uint64_t)Alignment->value()
3902 << Subtarget->getWavefrontSizeLog2(),
3903 dl, VT));
3904 }
3905
3906 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3907 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3908
3909 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3910}
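// A minimal scalar sketch of the wave scaling above, assuming a wave64 target
// and an upward-growing stack; this helper and its names are illustrative
// only, not part of the lowering.
static inline uint64_t exampleGrownWaveSP(uint64_t SP, uint64_t PerLaneBytes,
                                          uint64_t AllocAlign,
                                          uint64_t StackAlign) {
  const unsigned WavefrontSizeLog2 = 6; // wave64 assumed
  // The SGPR stack pointer is a wave-wide byte offset, so the per-lane request
  // is scaled by the wavefront size (the ISD::SHL node above).
  uint64_t NewSP = SP + (PerLaneBytes << WavefrontSizeLog2);
  // Over-aligned allocas additionally mask the result with the negated,
  // wave-scaled alignment (the ISD::AND node above).
  if (AllocAlign > StackAlign)
    NewSP &= ~((AllocAlign << WavefrontSizeLog2) - 1);
  return NewSP;
}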
3911
3913 SelectionDAG &DAG) const {
3914 // We only handle constant sizes here to allow non-entry block, static sized
3915 // allocas. A truly dynamic value is more difficult to support because we
3916 // don't know if the size value is uniform or not. If the size isn't uniform,
3917 // we would need to do a wave reduction to get the maximum size to know how
3918 // much to increment the uniform stack pointer.
3919 SDValue Size = Op.getOperand(1);
3920 if (isa<ConstantSDNode>(Size))
3921 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3922
3924}
3925
3927 if (Op.getValueType() != MVT::i32)
3928 return Op; // Defer to cannot select error.
3929
3931 SDLoc SL(Op);
3932
3933 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3934
3935 // Convert from wave uniform to swizzled vector address. This should protect
3936 // against any edge cases where the stacksave result isn't directly used with
3937 // stackrestore.
3938 SDValue VectorAddress =
3939 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3940 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3941}
3942
3944 SelectionDAG &DAG) const {
3945 SDLoc SL(Op);
3946 assert(Op.getValueType() == MVT::i32);
3947
3948 uint32_t BothRoundHwReg =
3950 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3951
3952 SDValue IntrinID =
3953 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
3954 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
3955 Op.getOperand(0), IntrinID, GetRoundBothImm);
3956
3957 // There are two rounding modes, one for f32 and one for f64/f16. We only
3958 // report in the standard value range if both are the same.
3959 //
3960 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
3961 // ties away from zero is not supported, and the other values are rotated by
3962 // 1.
3963 //
3964 // If the two rounding modes are not the same, report a target defined value.
3965
3966 // Mode register rounding mode fields:
3967 //
3968 // [1:0] Single-precision round mode.
3969 // [3:2] Double/Half-precision round mode.
3970 //
3971 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
3972 //
3973 //                 Hardware   Spec
3974 // Toward-0           3        0
3975 // Nearest Even       0        1
3976 // +Inf               1        2
3977 // -Inf               2        3
3978 // NearestAway0      N/A       4
3979 //
3980 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
3981 // table we can index by the raw hardware mode.
3982 //
3983 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
3984
3985 SDValue BitTable =
3987
3988 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
3989 SDValue RoundModeTimesNumBits =
3990 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
3991
3992 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
3993 // knew only one mode was demanded.
3994 SDValue TableValue =
3995 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
3996 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
3997
3998 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
3999 SDValue TableEntry =
4000 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4001
4002 // There's a gap between the 4-bit encoded table values and the actual enum
4003 // values, so offset the entry if it's an extended value.
4004 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4005 SDValue IsStandardValue =
4006 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4007 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4008 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4009 TableEntry, EnumOffset);
4010
4011 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4012}
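// A minimal scalar sketch of the table lookup above; the real conversion-table
// constant is not shown here, so it is taken as a parameter. Illustrative only,
// not part of the lowering.
static inline unsigned exampleDecodeFltRounds(unsigned RawMode4Bit,
                                              uint64_t ConversionTable) {
  // Each raw hardware mode selects a 4-bit table entry (shift by mode * 4).
  unsigned Entry = unsigned(ConversionTable >> (RawMode4Bit * 4)) & 0xf;
  // Entries below 4 are already standard FLT_ROUNDS values; extended entries
  // are offset by 4 to skip the gap in the enum.
  return Entry < 4 ? Entry : Entry + 4;
}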
4013
4015 if (Op->isDivergent())
4016 return SDValue();
4017
4018 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4023 break;
4024 default:
4025 return SDValue();
4026 }
4027
4028 return Op;
4029}
4030
4031// Work around DAG legality rules only based on the result type.
4033 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4034 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4035 EVT SrcVT = Src.getValueType();
4036
4037 if (SrcVT.getScalarType() != MVT::bf16)
4038 return Op;
4039
4040 SDLoc SL(Op);
4041 SDValue BitCast =
4042 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4043
4044 EVT DstVT = Op.getValueType();
4045 if (IsStrict)
4046 llvm_unreachable("Need STRICT_BF16_TO_FP");
4047
4048 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4049}
4050
4052 const MachineFunction &MF) const {
4054 .Case("m0", AMDGPU::M0)
4055 .Case("exec", AMDGPU::EXEC)
4056 .Case("exec_lo", AMDGPU::EXEC_LO)
4057 .Case("exec_hi", AMDGPU::EXEC_HI)
4058 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4059 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4060 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4061 .Default(Register());
4062
4063 if (Reg == AMDGPU::NoRegister) {
4064 report_fatal_error(Twine("invalid register name \""
4065 + StringRef(RegName) + "\"."));
4066
4067 }
4068
4069 if (!Subtarget->hasFlatScrRegister() &&
4070 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4071 report_fatal_error(Twine("invalid register \""
4072 + StringRef(RegName) + "\" for subtarget."));
4073 }
4074
4075 switch (Reg) {
4076 case AMDGPU::M0:
4077 case AMDGPU::EXEC_LO:
4078 case AMDGPU::EXEC_HI:
4079 case AMDGPU::FLAT_SCR_LO:
4080 case AMDGPU::FLAT_SCR_HI:
4081 if (VT.getSizeInBits() == 32)
4082 return Reg;
4083 break;
4084 case AMDGPU::EXEC:
4085 case AMDGPU::FLAT_SCR:
4086 if (VT.getSizeInBits() == 64)
4087 return Reg;
4088 break;
4089 default:
4090 llvm_unreachable("missing register type checking");
4091 }
4092
4093 report_fatal_error(Twine("invalid type for register \""
4094 + StringRef(RegName) + "\"."));
4095}
4096
4097// If kill is not the last instruction, split the block so kill is always a
4098// proper terminator.
4101 MachineBasicBlock *BB) const {
4102 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4104 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4105 return SplitBB;
4106}
4107
4108// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4109// \p MI will be the only instruction in the loop body block. Otherwise, it will
4110// be the first instruction in the remainder block.
4111//
4112/// \returns { LoopBody, Remainder }
4113static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4117
4118 // To insert the loop we need to split the block. Move everything after this
4119 // point to a new block, and insert a new empty block between the two.
4121 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4123 ++MBBI;
4124
4125 MF->insert(MBBI, LoopBB);
4126 MF->insert(MBBI, RemainderBB);
4127
4128 LoopBB->addSuccessor(LoopBB);
4129 LoopBB->addSuccessor(RemainderBB);
4130
4131 // Move the rest of the block into a new block.
4132 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4133
4134 if (InstInLoop) {
4135 auto Next = std::next(I);
4136
4137 // Move instruction to loop body.
4138 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4139
4140 // Move the rest of the block.
4141 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4142 } else {
4143 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4144 }
4145
4146 MBB.addSuccessor(LoopBB);
4147
4148 return std::pair(LoopBB, RemainderBB);
4149}
4150
4151/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4153 MachineBasicBlock *MBB = MI.getParent();
4155 auto I = MI.getIterator();
4156 auto E = std::next(I);
4157
4158 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4159 .addImm(0);
4160
4161 MIBundleBuilder Bundler(*MBB, I, E);
4162 finalizeBundle(*MBB, Bundler.begin());
4163}
4164
4167 MachineBasicBlock *BB) const {
4168 const DebugLoc &DL = MI.getDebugLoc();
4169
4171
4172 MachineBasicBlock *LoopBB;
4173 MachineBasicBlock *RemainderBB;
4175
4176 // Apparently kill flags are only valid if the def is in the same block?
4177 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4178 Src->setIsKill(false);
4179
4180 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4181
4182 MachineBasicBlock::iterator I = LoopBB->end();
4183
4184 const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
4186
4187 // Clear TRAP_STS.MEM_VIOL
4188 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4189 .addImm(0)
4190 .addImm(EncodedReg);
4191
4193
4194 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4195
4196 // Load and check TRAP_STS.MEM_VIOL
4197 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4198 .addImm(EncodedReg);
4199
4200 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4201 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4202 .addReg(Reg, RegState::Kill)
4203 .addImm(0);
4204 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4205 .addMBB(LoopBB);
4206
4207 return RemainderBB;
4208}
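// Roughly, the loop built above retries until TRAP_STS.MEM_VIOL reads back
// clear (illustrative sketch only; register names are hypothetical):
//
//   loop:
//     s_setreg_imm32_b32 <TRAP_STS.MEM_VIOL>, 0   ; clear the violation bit
//     <the GWS operation, with its trailing s_waitcnt 0>
//     s_getreg_b32  s_tmp, <TRAP_STS.MEM_VIOL>    ; read it back
//     s_cmp_lg_u32  s_tmp, 0
//     s_cbranch_scc1 loop                         ; retry on a violation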
4209
4210// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4211// wavefront. If the value is uniform and just happens to be in a VGPR, this
4212// will only do one iteration. In the worst case, this will loop 64 times.
4213//
4214// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4217 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4218 const DebugLoc &DL, const MachineOperand &Idx,
4219 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4220 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4221 Register &SGPRIdxReg) {
4222
4223 MachineFunction *MF = OrigBB.getParent();
4224 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4225 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4227
4228 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4229 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4230 Register NewExec = MRI.createVirtualRegister(BoolRC);
4231 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4232 Register CondReg = MRI.createVirtualRegister(BoolRC);
4233
4234 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4235 .addReg(InitReg)
4236 .addMBB(&OrigBB)
4237 .addReg(ResultReg)
4238 .addMBB(&LoopBB);
4239
4240 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4241 .addReg(InitSaveExecReg)
4242 .addMBB(&OrigBB)
4243 .addReg(NewExec)
4244 .addMBB(&LoopBB);
4245
4246 // Read the next variant <- also loop target.
4247 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4248 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4249
4250 // Compare the just read M0 value to all possible Idx values.
4251 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4252 .addReg(CurrentIdxReg)
4253 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4254
4255 // Update EXEC, save the original EXEC value to VCC.
4256 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4257 : AMDGPU::S_AND_SAVEEXEC_B64),
4258 NewExec)
4259 .addReg(CondReg, RegState::Kill);
4260
4261 MRI.setSimpleHint(NewExec, CondReg);
4262
4263 if (UseGPRIdxMode) {
4264 if (Offset == 0) {
4265 SGPRIdxReg = CurrentIdxReg;
4266 } else {
4267 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4268 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4269 .addReg(CurrentIdxReg, RegState::Kill)
4270 .addImm(Offset);
4271 }
4272 } else {
4273 // Move index from VCC into M0
4274 if (Offset == 0) {
4275 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4276 .addReg(CurrentIdxReg, RegState::Kill);
4277 } else {
4278 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4279 .addReg(CurrentIdxReg, RegState::Kill)
4280 .addImm(Offset);
4281 }
4282 }
4283
4284 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4285 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4286 MachineInstr *InsertPt =
4287 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4288 : AMDGPU::S_XOR_B64_term), Exec)
4289 .addReg(Exec)
4290 .addReg(NewExec);
4291
4292 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4293 // s_cbranch_scc0?
4294
4295 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4296 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4297 .addMBB(&LoopBB);
4298
4299 return InsertPt->getIterator();
4300}
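// For reference, the waterfall loop assembled above corresponds roughly to the
// following (wave64 shown; register names are illustrative only):
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx        ; pick one lane's index
//     v_cmp_eq_u32        vcc, s_idx, v_idx   ; lanes sharing that index
//     s_and_saveexec_b64  s_save, vcc         ; save exec, restrict to them
//     <indirect access using s_idx / M0>
//     s_xor_b64           exec, exec, s_save  ; retire the handled lanes
//     s_cbranch_execnz    loop                ; repeat until exec is empty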
4301
4302// This has slightly sub-optimal regalloc when the source vector is killed by
4303// the read. The register allocator does not understand that the kill is
4304// per-workitem, so the source is kept alive for the whole loop and we end up not
4305// re-using a subregister from it, using 1 more VGPR than necessary. That extra
4306// VGPR was saved back when this expansion was done after register allocation.
4309 unsigned InitResultReg, unsigned PhiReg, int Offset,
4310 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4312 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4313 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4315 const DebugLoc &DL = MI.getDebugLoc();
4317
4318 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4319 Register DstReg = MI.getOperand(0).getReg();
4320 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4321 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4322 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4323 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4324
4325 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4326
4327 // Save the EXEC mask
4328 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4329 .addReg(Exec);
4330
4331 MachineBasicBlock *LoopBB;
4332 MachineBasicBlock *RemainderBB;
4333 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4334
4335 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4336
4337 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4338 InitResultReg, DstReg, PhiReg, TmpExec,
4339 Offset, UseGPRIdxMode, SGPRIdxReg);
4340
4341 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4343 ++MBBI;
4344 MF->insert(MBBI, LandingPad);
4345 LoopBB->removeSuccessor(RemainderBB);
4346 LandingPad->addSuccessor(RemainderBB);
4347 LoopBB->addSuccessor(LandingPad);
4348 MachineBasicBlock::iterator First = LandingPad->begin();
4349 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4350 .addReg(SaveExec);
4351
4352 return InsPt;
4353}
4354
4355// Returns subreg index, offset
4356static std::pair<unsigned, int>
4358 const TargetRegisterClass *SuperRC,
4359 unsigned VecReg,
4360 int Offset) {
4361 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4362
4363 // Skip out of bounds offsets, or else we would end up using an undefined
4364 // register.
4365 if (Offset >= NumElts || Offset < 0)
4366 return std::pair(AMDGPU::sub0, Offset);
4367
4368 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4369}
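// A standalone model of the split above, illustrative only: an in-range
// constant offset folds into the subreg index, while an out-of-range (or
// negative) offset stays dynamic on sub0.
static inline std::pair<unsigned, int> exampleIndirectSplit(unsigned NumElts,
                                                            int Offset) {
  if (Offset >= int(NumElts) || Offset < 0)
    return {0, Offset};           // stay on sub0, keep the dynamic offset
  return {unsigned(Offset), 0};   // e.g. offset 2 of a v4 becomes sub2, 0
}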
4370
4373 int Offset) {
4374 MachineBasicBlock *MBB = MI.getParent();
4375 const DebugLoc &DL = MI.getDebugLoc();
4377
4378 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4379
4380 assert(Idx->getReg() != AMDGPU::NoRegister);
4381
4382 if (Offset == 0) {
4383 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4384 } else {
4385 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4386 .add(*Idx)
4387 .addImm(Offset);
4388 }
4389}
4390
4393 int Offset) {
4394 MachineBasicBlock *MBB = MI.getParent();
4395 const DebugLoc &DL = MI.getDebugLoc();
4397
4398 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4399
4400 if (Offset == 0)
4401 return Idx->getReg();
4402
4403 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4404 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4405 .add(*Idx)
4406 .addImm(Offset);
4407 return Tmp;
4408}
4409
4412 const GCNSubtarget &ST) {
4413 const SIInstrInfo *TII = ST.getInstrInfo();
4414 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4417
4418 Register Dst = MI.getOperand(0).getReg();
4419 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4420 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4421 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4422
4423 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4424 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4425
4426 unsigned SubReg;
4427 std::tie(SubReg, Offset)
4428 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4429
4430 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4431
4432 // Check for an SGPR index.
4433 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4435 const DebugLoc &DL = MI.getDebugLoc();
4436
4437 if (UseGPRIdxMode) {
4438 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4439 // to avoid interfering with other uses, so probably requires a new
4440 // optimization pass.
4442
4443 const MCInstrDesc &GPRIDXDesc =
4444 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4445 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4446 .addReg(SrcReg)
4447 .addReg(Idx)
4448 .addImm(SubReg);
4449 } else {
4451
4452 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4453 .addReg(SrcReg, 0, SubReg)
4454 .addReg(SrcReg, RegState::Implicit);
4455 }
4456
4457 MI.eraseFromParent();
4458
4459 return &MBB;
4460 }
4461
4462 // Control flow needs to be inserted if indexing with a VGPR.
4463 const DebugLoc &DL = MI.getDebugLoc();
4465
4466 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4467 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4468
4469 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4470
4471 Register SGPRIdxReg;
4472 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4473 UseGPRIdxMode, SGPRIdxReg);
4474
4475 MachineBasicBlock *LoopBB = InsPt->getParent();
4476
4477 if (UseGPRIdxMode) {
4478 const MCInstrDesc &GPRIDXDesc =
4479 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4480
4481 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4482 .addReg(SrcReg)
4483 .addReg(SGPRIdxReg)
4484 .addImm(SubReg);
4485 } else {
4486 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4487 .addReg(SrcReg, 0, SubReg)
4488 .addReg(SrcReg, RegState::Implicit);
4489 }
4490
4491 MI.eraseFromParent();
4492
4493 return LoopBB;
4494}
4495
4498 const GCNSubtarget &ST) {
4499 const SIInstrInfo *TII = ST.getInstrInfo();
4500 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4503
4504 Register Dst = MI.getOperand(0).getReg();
4505 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4506 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4507 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4508 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4509 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4510 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4511
4512 // This can be an immediate, but will be folded later.
4513 assert(Val->getReg());
4514
4515 unsigned SubReg;
4516 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4517 SrcVec->getReg(),
4518 Offset);
4519 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4520
4521 if (Idx->getReg() == AMDGPU::NoRegister) {
4523 const DebugLoc &DL = MI.getDebugLoc();
4524
4525 assert(Offset == 0);
4526
4527 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4528 .add(*SrcVec)
4529 .add(*Val)
4530 .addImm(SubReg);
4531
4532 MI.eraseFromParent();
4533 return &MBB;
4534 }
4535
4536 // Check for an SGPR index.
4537 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4539 const DebugLoc &DL = MI.getDebugLoc();
4540
4541 if (UseGPRIdxMode) {
4543
4544 const MCInstrDesc &GPRIDXDesc =
4545 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4546 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4547 .addReg(SrcVec->getReg())
4548 .add(*Val)
4549 .addReg(Idx)
4550 .addImm(SubReg);
4551 } else {
4553
4554 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4555 TRI.getRegSizeInBits(*VecRC), 32, false);
4556 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4557 .addReg(SrcVec->getReg())
4558 .add(*Val)
4559 .addImm(SubReg);
4560 }
4561 MI.eraseFromParent();
4562 return &MBB;
4563 }
4564
4565 // Control flow needs to be inserted if indexing with a VGPR.
4566 if (Val->isReg())
4567 MRI.clearKillFlags(Val->getReg());
4568
4569 const DebugLoc &DL = MI.getDebugLoc();
4570
4571 Register PhiReg = MRI.createVirtualRegister(VecRC);
4572
4573 Register SGPRIdxReg;
4574 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4575 UseGPRIdxMode, SGPRIdxReg);
4576 MachineBasicBlock *LoopBB = InsPt->getParent();
4577
4578 if (UseGPRIdxMode) {
4579 const MCInstrDesc &GPRIDXDesc =
4580 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4581
4582 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4583 .addReg(PhiReg)
4584 .add(*Val)
4585 .addReg(SGPRIdxReg)
4586 .addImm(AMDGPU::sub0);
4587 } else {
4588 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4589 TRI.getRegSizeInBits(*VecRC), 32, false);
4590 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4591 .addReg(PhiReg)
4592 .add(*Val)
4593 .addImm(AMDGPU::sub0);
4594 }
4595
4596 MI.eraseFromParent();
4597 return LoopBB;
4598}
4599
4602 const GCNSubtarget &ST,
4603 unsigned Opc) {
4605 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4606 const DebugLoc &DL = MI.getDebugLoc();
4607 const SIInstrInfo *TII = ST.getInstrInfo();
4608
4609 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4610 Register SrcReg = MI.getOperand(1).getReg();
4611 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4612 Register DstReg = MI.getOperand(0).getReg();
4613 MachineBasicBlock *RetBB = nullptr;
4614 if (isSGPR) {
4615 // These operations are idempotent for a uniform value, i.e. an SGPR input.
4616 // The reduced value will be the same as the given SGPR.
4617 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4618 RetBB = &BB;
4619 } else {
4620 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4621 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4622 // iterative approach by default.
4623
4624 // To reduce the VGPR using the iterative approach, we need to iterate
4625 // over all the active lanes. Lowering consists of a ComputeLoop,
4626 // which iterates over only the active lanes. We use a copy of the EXEC
4627 // register as the induction variable, and every active lane clears its bit
4628 // using bitset0 so that we get the next active lane for the next iteration.
4630 Register SrcReg = MI.getOperand(1).getReg();
4631
4632 // Create control flow for the loop:
4633 // split MI's machine basic block into the ComputeLoop and ComputeEnd blocks.
4634 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4635
4636 // Create virtual registers required for lowering.
4637 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4638 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4639 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4640 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4641
4642 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4643 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4644 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4645
4646 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4647 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4648
4649 bool IsWave32 = ST.isWave32();
4650 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4651 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4652
4653 // Create initial values of the induction variable from Exec and the
4654 // Accumulator, and insert a branch instruction to the newly created ComputeLoop.
4655 uint32_t InitalValue =
4656 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4657 auto TmpSReg =
4658 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4659 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4660 .addImm(InitalValue);
4661 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4662
4663 // Start constructing ComputeLoop
4664 I = ComputeLoop->end();
4665 auto Accumulator =
4666 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4667 .addReg(InitalValReg)
4668 .addMBB(&BB);
4669 auto ActiveBits =
4670 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4671 .addReg(TmpSReg->getOperand(0).getReg())
4672 .addMBB(&BB);
4673
4674 // Perform the computations
4675 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4676 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4677 .addReg(ActiveBits->getOperand(0).getReg());
4678 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4679 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4680 .addReg(SrcReg)
4681 .addReg(FF1->getOperand(0).getReg());
4682 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4683 .addReg(Accumulator->getOperand(0).getReg())
4684 .addReg(LaneValue->getOperand(0).getReg());
4685
4686 // Manipulate the iterator to get the next active lane
4687 unsigned BITSETOpc =
4688 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4689 auto NewActiveBits =
4690 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4691 .addReg(FF1->getOperand(0).getReg())
4692 .addReg(ActiveBits->getOperand(0).getReg());
4693
4694 // Add phi nodes
4695 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4696 .addMBB(ComputeLoop);
4697 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4698 .addMBB(ComputeLoop);
4699
4700 // Create the branch back to ComputeLoop while active lanes remain.
4701 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4702 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4703 .addReg(NewActiveBits->getOperand(0).getReg())
4704 .addImm(0);
4705 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4706 .addMBB(ComputeLoop);
4707
4708 RetBB = ComputeEnd;
4709 }
4710 MI.eraseFromParent();
4711 return RetBB;
4712}
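// A scalar model of the iterative reduction emitted above, assuming a umin
// reduction on a wave64 exec mask; this helper and its inputs are illustrative
// only.
static inline uint32_t exampleWaveReduceUMin(const uint32_t LaneValue[64],
                                             uint64_t ExecMask) {
  uint32_t Acc = std::numeric_limits<uint32_t>::max(); // identity for umin
  while (ExecMask) {
    unsigned Lane = llvm::countr_zero(ExecMask);  // s_ff1_i32_b64
    uint32_t V = LaneValue[Lane];                 // v_readlane_b32
    Acc = V < Acc ? V : Acc;                      // s_min_u32
    ExecMask &= ExecMask - 1;                     // s_bitset0_b64
  }
  return Acc;
}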
4713
4715 MachineInstr &MI, MachineBasicBlock *BB) const {
4716
4718 MachineFunction *MF = BB->getParent();
4720
4721 switch (MI.getOpcode()) {
4722 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4723 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4724 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4725 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4726 case AMDGPU::S_UADDO_PSEUDO:
4727 case AMDGPU::S_USUBO_PSEUDO: {
4728 const DebugLoc &DL = MI.getDebugLoc();
4729 MachineOperand &Dest0 = MI.getOperand(0);
4730 MachineOperand &Dest1 = MI.getOperand(1);
4731 MachineOperand &Src0 = MI.getOperand(2);
4732 MachineOperand &Src1 = MI.getOperand(3);
4733
4734 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4735 ? AMDGPU::S_ADD_I32
4736 : AMDGPU::S_SUB_I32;
4737 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4738
4739 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4740 .addImm(1)
4741 .addImm(0);
4742
4743 MI.eraseFromParent();
4744 return BB;
4745 }
4746 case AMDGPU::S_ADD_U64_PSEUDO:
4747 case AMDGPU::S_SUB_U64_PSEUDO: {
4748 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4749 // For GFX12, we emit s_add_u64 and s_sub_u64.
4750 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4752 const DebugLoc &DL = MI.getDebugLoc();
4753 MachineOperand &Dest = MI.getOperand(0);
4754 MachineOperand &Src0 = MI.getOperand(1);
4755 MachineOperand &Src1 = MI.getOperand(2);
4756 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4757 if (Subtarget->hasScalarAddSub64()) {
4758 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4759 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4760 .addReg(Src0.getReg())
4761 .addReg(Src1.getReg());
4762 } else {
4763 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4764 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4765
4766 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4767 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4768
4769 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4770 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4771 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4772 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4773
4774 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4775 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4776 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4777 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4778
4779 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4780 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4781 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4782 .add(Src0Sub0)
4783 .add(Src1Sub0);
4784 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4785 .add(Src0Sub1)
4786 .add(Src1Sub1);
4787 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4788 .addReg(DestSub0)
4789 .addImm(AMDGPU::sub0)
4790 .addReg(DestSub1)
4791 .addImm(AMDGPU::sub1);
4792 }
4793 MI.eraseFromParent();
4794 return BB;
4795 }
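  // As a worked example of the 32-bit expansion path (illustrative values):
  // adding 0x00000001'FFFFFFFF and 0x00000000'00000001, s_add_u32 produces
  // lo = 0 with SCC (carry) = 1, and s_addc_u32 produces hi = 1 + 0 + 1 = 2,
  // so the REG_SEQUENCE result is 0x00000002'00000000.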
4796 case AMDGPU::V_ADD_U64_PSEUDO:
4797 case AMDGPU::V_SUB_U64_PSEUDO: {
4799 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4800 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4801 const DebugLoc &DL = MI.getDebugLoc();
4802
4803 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4804
4805 MachineOperand &Dest = MI.getOperand(0);
4806 MachineOperand &Src0 = MI.getOperand(1);
4807 MachineOperand &Src1 = MI.getOperand(2);
4808
4809 if (IsAdd && ST.hasLshlAddB64()) {
4810 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4811 Dest.getReg())
4812 .add(Src0)
4813 .addImm(0)
4814 .add(Src1);
4815 TII->legalizeOperands(*Add);
4816 MI.eraseFromParent();
4817 return BB;
4818 }
4819
4820 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4821
4822 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4823 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4824
4825 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4826 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4827
4828 const TargetRegisterClass *Src0RC = Src0.isReg()
4829 ? MRI.getRegClass(Src0.getReg())
4830 : &AMDGPU::VReg_64RegClass;
4831 const TargetRegisterClass *Src1RC = Src1.isReg()
4832 ? MRI.getRegClass(Src1.getReg())
4833 : &AMDGPU::VReg_64RegClass;
4834
4835 const TargetRegisterClass *Src0SubRC =
4836 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4837 const TargetRegisterClass *Src1SubRC =
4838 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4839
4840 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4841 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4842 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4843 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4844
4845 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4846 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4847 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4848 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4849
4850 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4851 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4852 .addReg(CarryReg, RegState::Define)
4853 .add(SrcReg0Sub0)
4854 .add(SrcReg1Sub0)
4855 .addImm(0); // clamp bit
4856
4857 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4858 MachineInstr *HiHalf =
4859 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4860 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4861 .add(SrcReg0Sub1)
4862 .add(SrcReg1Sub1)
4863 .addReg(CarryReg, RegState::Kill)
4864 .addImm(0); // clamp bit
4865
4866 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4867 .addReg(DestSub0)
4868 .addImm(AMDGPU::sub0)
4869 .addReg(DestSub1)
4870 .addImm(AMDGPU::sub1);
4871 TII->legalizeOperands(*LoHalf);
4872 TII->legalizeOperands(*HiHalf);
4873 MI.eraseFromParent();
4874 return BB;
4875 }
4876 case AMDGPU::S_ADD_CO_PSEUDO:
4877 case AMDGPU::S_SUB_CO_PSEUDO: {
4878 // This pseudo has a chance to be selected
4879 // only from a uniform add/subcarry node. All the VGPR operands
4880 // are therefore assumed to be splat vectors.
4882 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4883 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4885 const DebugLoc &DL = MI.getDebugLoc();
4886 MachineOperand &Dest = MI.getOperand(0);
4887 MachineOperand &CarryDest = MI.getOperand(1);
4888 MachineOperand &Src0 = MI.getOperand(2);
4889 MachineOperand &Src1 = MI.getOperand(3);
4890 MachineOperand &Src2 = MI.getOperand(4);
4891 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4892 ? AMDGPU::S_ADDC_U32
4893 : AMDGPU::S_SUBB_U32;
4894 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4895 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4896 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4897 .addReg(Src0.getReg());
4898 Src0.setReg(RegOp0);
4899 }
4900 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4901 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4902 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4903 .addReg(Src1.getReg());
4904 Src1.setReg(RegOp1);
4905 }
4906 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4907 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4908 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4909 .addReg(Src2.getReg());
4910 Src2.setReg(RegOp2);
4911 }
4912
4913 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4914 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4915 assert(WaveSize == 64 || WaveSize == 32);
4916
4917 if (WaveSize == 64) {
4918 if (ST.hasScalarCompareEq64()) {
4919 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4920 .addReg(Src2.getReg())
4921 .addImm(0);
4922 } else {
4923 const TargetRegisterClass *SubRC =
4924 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
4925 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4926 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4927 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4928 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4929 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4930
4931 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4932 .add(Src2Sub0)
4933 .add(Src2Sub1);
4934
4935 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4936 .addReg(Src2_32, RegState::Kill)
4937 .addImm(0);
4938 }
4939 } else {
4940 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4941 .addReg(Src2.getReg())
4942 .addImm(0);
4943 }
4944
4945 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4946
4947 unsigned SelOpc =
4948 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4949
4950 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
4951 .addImm(-1)
4952 .addImm(0);
4953
4954 MI.eraseFromParent();
4955 return BB;
4956 }
4957 case AMDGPU::SI_INIT_M0: {
4958 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
4959 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4960 .add(MI.getOperand(0));
4961 MI.eraseFromParent();
4962 return BB;
4963 }
4964 case AMDGPU::GET_GROUPSTATICSIZE: {
4965 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4966 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
4967 DebugLoc DL = MI.getDebugLoc();
4968 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
4969 .add(MI.getOperand(0))
4970 .addImm(MFI->getLDSSize());
4971 MI.eraseFromParent();
4972 return BB;
4973 }
4974 case AMDGPU::GET_SHADERCYCLESHILO: {
4977 const DebugLoc &DL = MI.getDebugLoc();
4978 // The algorithm is:
4979 //
4980 // hi1 = getreg(SHADER_CYCLES_HI)
4981 // lo1 = getreg(SHADER_CYCLES_LO)
4982 // hi2 = getreg(SHADER_CYCLES_HI)
4983 //
4984 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
4985 // Otherwise there was overflow and the result is hi2:0. In both cases the
4986 // result should represent the actual time at some point during the sequence
4987 // of three getregs.
4988 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4989 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
4991 0, 32));
4992 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4993 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
4994 .addImm(
4996 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4997 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
4999 0, 32));
5000 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5001 .addReg(RegHi1)
5002 .addReg(RegHi2);
5003 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5004 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5005 .addReg(RegLo1)
5006 .addImm(0);
5007 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5008 .add(MI.getOperand(0))
5009 .addReg(RegLo)
5010 .addImm(AMDGPU::sub0)
5011 .addReg(RegHi2)
5012 .addImm(AMDGPU::sub1);
5013 MI.eraseFromParent();
5014 return BB;
5015 }
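  // As a worked example (illustrative values): if the low counter wraps
  // between the reads, e.g. hi1 = 5, then lo1 = 0x10 and hi2 = 6, the
  // hi1 == hi2 compare fails and the result is hi2:0 = 6:0, a time that did
  // occur during the three reads. With no wrap (hi1 == hi2) the result is
  // simply hi2:lo1.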
5016 case AMDGPU::SI_INDIRECT_SRC_V1:
5017 case AMDGPU::SI_INDIRECT_SRC_V2:
5018 case AMDGPU::SI_INDIRECT_SRC_V4:
5019 case AMDGPU::SI_INDIRECT_SRC_V8:
5020 case AMDGPU::SI_INDIRECT_SRC_V9:
5021 case AMDGPU::SI_INDIRECT_SRC_V10:
5022 case AMDGPU::SI_INDIRECT_SRC_V11:
5023 case AMDGPU::SI_INDIRECT_SRC_V12:
5024 case AMDGPU::SI_INDIRECT_SRC_V16:
5025 case AMDGPU::SI_INDIRECT_SRC_V32:
5026 return emitIndirectSrc(MI, *BB, *getSubtarget());
5027 case AMDGPU::SI_INDIRECT_DST_V1:
5028 case AMDGPU::SI_INDIRECT_DST_V2:
5029 case AMDGPU::SI_INDIRECT_DST_V4:
5030 case AMDGPU::SI_INDIRECT_DST_V8:
5031 case AMDGPU::SI_INDIRECT_DST_V9:
5032 case AMDGPU::SI_INDIRECT_DST_V10:
5033 case AMDGPU::SI_INDIRECT_DST_V11:
5034 case AMDGPU::SI_INDIRECT_DST_V12:
5035 case AMDGPU::SI_INDIRECT_DST_V16:
5036 case AMDGPU::SI_INDIRECT_DST_V32:
5037 return emitIndirectDst(MI, *BB, *getSubtarget());
5038 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5039 case AMDGPU::SI_KILL_I1_PSEUDO:
5040 return splitKillBlock(MI, BB);
5041 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5045
5046 Register Dst = MI.getOperand(0).getReg();
5047 const MachineOperand &Src0 = MI.getOperand(1);
5048 const MachineOperand &Src1 = MI.getOperand(2);
5049 const DebugLoc &DL = MI.getDebugLoc();
5050 Register SrcCond = MI.getOperand(3).getReg();
5051
5052 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5053 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5054 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5055 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5056
5057 const TargetRegisterClass *Src0RC = Src0.isReg()
5058 ? MRI.getRegClass(Src0.getReg())
5059 : &AMDGPU::VReg_64RegClass;
5060 const TargetRegisterClass *Src1RC = Src1.isReg()
5061 ? MRI.getRegClass(Src1.getReg())
5062 : &AMDGPU::VReg_64RegClass;
5063
5064 const TargetRegisterClass *Src0SubRC =
5065 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5066 const TargetRegisterClass *Src1SubRC =
5067 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5068
5069 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5070 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5071 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5072 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5073
5074 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5075 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5076 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5077 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5078
5079 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5080 .addReg(SrcCond);
5081 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5082 .addImm(0)
5083 .add(Src0Sub0)
5084 .addImm(0)
5085 .add(Src1Sub0)
5086 .addReg(SrcCondCopy);
5087 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5088 .addImm(0)
5089 .add(Src0Sub1)
5090 .addImm(0)
5091 .add(Src1Sub1)
5092 .addReg(SrcCondCopy);
5093
5094 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5095 .addReg(DstLo)
5096 .addImm(AMDGPU::sub0)
5097 .addReg(DstHi)
5098 .addImm(AMDGPU::sub1);
5099 MI.eraseFromParent();
5100 return BB;
5101 }
5102 case AMDGPU::SI_BR_UNDEF: {
5104 const DebugLoc &DL = MI.getDebugLoc();
5105 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5106 .add(MI.getOperand(0));
5107 Br->getOperand(1).setIsUndef(); // read undef SCC
5108 MI.eraseFromParent();
5109 return BB;
5110 }
5111 case AMDGPU::ADJCALLSTACKUP:
5112 case AMDGPU::ADJCALLSTACKDOWN: {
5114 MachineInstrBuilder MIB(*MF, &MI);
5115 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5116 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5117 return BB;
5118 }
5119 case AMDGPU::SI_CALL_ISEL: {
5121 const DebugLoc &DL = MI.getDebugLoc();
5122
5123 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5124
5126 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5127
5128 for (const MachineOperand &MO : MI.operands())
5129 MIB.add(MO);
5130
5131 MIB.cloneMemRefs(MI);
5132 MI.eraseFromParent();
5133 return BB;
5134 }
5135 case AMDGPU::V_ADD_CO_U32_e32:
5136 case AMDGPU::V_SUB_CO_U32_e32:
5137 case AMDGPU::V_SUBREV_CO_U32_e32: {
5138 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5139 const DebugLoc &DL = MI.getDebugLoc();
5140 unsigned Opc = MI.getOpcode();
5141
5142 bool NeedClampOperand = false;
5143 if (TII->pseudoToMCOpcode(Opc) == -1) {
5144 Opc = AMDGPU::getVOPe64(Opc);
5145 NeedClampOperand = true;
5146 }
5147
5148 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5149 if (TII->isVOP3(*I)) {
5150 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5151 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5152 I.addReg(TRI->getVCC(), RegState::Define);
5153 }
5154 I.add(MI.getOperand(1))
5155 .add(MI.getOperand(2));
5156 if (NeedClampOperand)
5157 I.addImm(0); // clamp bit for e64 encoding
5158
5159 TII->legalizeOperands(*I);
5160
5161 MI.eraseFromParent();
5162 return BB;
5163 }
5164 case AMDGPU::V_ADDC_U32_e32:
5165 case AMDGPU::V_SUBB_U32_e32:
5166 case AMDGPU::V_SUBBREV_U32_e32:
5167 // These instructions have an implicit use of vcc which counts towards the
5168 // constant bus limit.
5169 TII->legalizeOperands(MI);
5170 return BB;
5171 case AMDGPU::DS_GWS_INIT:
5172 case AMDGPU::DS_GWS_SEMA_BR:
5173 case AMDGPU::DS_GWS_BARRIER:
5174 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5175 [[fallthrough]];
5176 case AMDGPU::DS_GWS_SEMA_V:
5177 case AMDGPU::DS_GWS_SEMA_P:
5178 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5179 // An s_waitcnt 0 is required to be the instruction immediately following.
5180 if (getSubtarget()->hasGWSAutoReplay()) {
5182 return BB;
5183 }
5184
5185 return emitGWSMemViolTestLoop(MI, BB);
5186 case AMDGPU::S_SETREG_B32: {
5187 // Try to optimize cases that only set the denormal mode or rounding mode.
5188 //
5189 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5190 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5191 // instead.
5192 //
5193 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5194 // allow you to have a no-side-effect instruction in the output of a
5195 // side-effecting pattern.
5196 unsigned ID, Offset, Width;
5197 AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
5199 return BB;
5200
5201 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5202 const unsigned SetMask = WidthMask << Offset;
5203
5204 if (getSubtarget()->hasDenormModeInst()) {
5205 unsigned SetDenormOp = 0;
5206 unsigned SetRoundOp = 0;
5207
5208 // The dedicated instructions can only set the whole denorm or round mode
5209 // at once, not a subset of bits in either.
5210 if (SetMask ==
5212 // If this fully sets both the round and denorm mode, emit the two
5213 // dedicated instructions for these.
5214 SetRoundOp = AMDGPU::S_ROUND_MODE;
5215 SetDenormOp = AMDGPU::S_DENORM_MODE;
5216 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5217 SetRoundOp = AMDGPU::S_ROUND_MODE;
5218 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5219 SetDenormOp = AMDGPU::S_DENORM_MODE;
5220 }
5221
5222 if (SetRoundOp || SetDenormOp) {
5224 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5225 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5226 unsigned ImmVal = Def->getOperand(1).getImm();
5227 if (SetRoundOp) {
5228 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5229 .addImm(ImmVal & 0xf);
5230
5231 // If we also have the denorm mode, get just the denorm mode bits.
5232 ImmVal >>= 4;
5233 }
5234
5235 if (SetDenormOp) {
5236 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5237 .addImm(ImmVal & 0xf);
5238 }
5239
5240 MI.eraseFromParent();
5241 return BB;
5242 }
5243 }
5244 }
5245
5246 // If only FP bits are touched, use the no-side-effects pseudo.
5247 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5248 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5249 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5250
5251 return BB;
5252 }
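  // As a worked example (illustrative values): an s_setreg_b32 whose hwreg
  // operand covers exactly the 4 round-mode bits and whose source is a move
  // of the constant 3 becomes s_round_mode 0x3; one covering both fields with
  // the constant 0xC3 becomes s_round_mode 0x3 followed by s_denorm_mode 0xC
  // (the low 4 bits select rounding, the next 4 the denorm mode).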
5253 case AMDGPU::S_INVERSE_BALLOT_U32:
5254 case AMDGPU::S_INVERSE_BALLOT_U64: {
5256 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5257 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5258 const DebugLoc &DL = MI.getDebugLoc();
5259 const Register DstReg = MI.getOperand(0).getReg();
5260 Register MaskReg = MI.getOperand(1).getReg();
5261
5262 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5263
5264 if (IsVALU) {
5265 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5266 }
5267
5268 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5269 MI.eraseFromParent();
5270 return BB;
5271 }
5272 case AMDGPU::ENDPGM_TRAP: {
5273 const DebugLoc &DL = MI.getDebugLoc();
5274 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5275 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5276 MI.addOperand(MachineOperand::CreateImm(0));
5277 return BB;
5278 }
5279
5280 // We need a block split to make the real endpgm a terminator. We also don't
5281 // want to break phis in successor blocks, so we can't just delete to the
5282 // end of the block.
5283
5284 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5286 MF->push_back(TrapBB);
5287 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5288 .addImm(0);
5289 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5290 .addMBB(TrapBB);
5291
5292 BB->addSuccessor(TrapBB);
5293 MI.eraseFromParent();
5294 return SplitBB;
5295 }
5296 default:
5298 }
5299}
5300
5302 switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
5303 case MVT::f32:
5304 return Subtarget->hasAtomicFaddRtnInsts();
5305 case MVT::v2f16:
5306 case MVT::f64:
5307 return Subtarget->hasGFX90AInsts();
5308 default:
5309 return false;
5310 }
5311}
5312
5314 // This currently forces unfolding various combinations of fsub into fma with
5315 // free fneg'd operands. As long as we have fast FMA (controlled by
5316 // isFMAFasterThanFMulAndFAdd), we should perform these.
5317
5318 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5319 // most of these combines appear to be cycle neutral but save on instruction
5320 // count / code size.
5321 return true;
5322}
5323
5324bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5325
5326EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5327 EVT VT) const {
5328 if (!VT.isVector()) {
5329 return MVT::i1;
5330 }
5331 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5332}
5333
5334MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5335 // TODO: Should i16 be used always if legal? For now it would force VALU
5336 // shifts.
5337 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5338}
5339
5340LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5341 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5342 ? Ty.changeElementSize(16)
5343 : Ty.changeElementSize(32);
5344}
5345
5346// Answering this is somewhat tricky and depends on the specific device, since
5347// devices have different rates for fma or for all f64 operations.
5348//
5349// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5350// regardless of which device (although the number of cycles differs between
5351// devices), so it is always profitable for f64.
5352//
5353// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5354// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5355// which we can always do even without fused FP ops since it returns the same
5356// result as the separate operations and since it is always full
5357// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5358// however does not support denormals, so we do report fma as faster if we have
5359// a fast fma device and require denormals.
5360//
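// For example, on a full-rate-FMA device with denormals enabled, (a * b) + c
// is better formed as one v_fma_f32 than as v_mul_f32 + v_add_f32, so fma is
// reported as faster for f32 in that configuration.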
5361bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5362 EVT VT) const {
5363 VT = VT.getScalarType();
5364
5365 switch (VT.getSimpleVT().SimpleTy) {
5366 case MVT::f32: {
5367 // If mad is not available this depends only on if f32 fma is full rate.
5368 if (!Subtarget->hasMadMacF32Insts())
5369 return Subtarget->hasFastFMAF32();
5370
5371 // Otherwise f32 mad is always full rate and returns the same result as
5372 // the separate operations, so it should be preferred over fma.
5373 // However, it does not support denormals.
5374 if (!denormalModeIsFlushAllF32(MF))
5375 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5376
5377 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5378 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5379 }
5380 case MVT::f64:
5381 return true;
5382 case MVT::f16:
5383 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5384 default:
5385 break;
5386 }
5387
5388 return false;
5389}
5390
5391bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5392 LLT Ty) const {
5393 switch (Ty.getScalarSizeInBits()) {
5394 case 16:
5395 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5396 case 32:
5397 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5398 case 64:
5399 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5400 default:
5401 break;
5402 }
5403
5404 return false;
5405}
5406
5407bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
5408 if (!Ty.isScalar())
5409 return false;
5410
5411 if (Ty.getScalarSizeInBits() == 16)
5412 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5413 if (Ty.getScalarSizeInBits() == 32)
5414 return Subtarget->hasMadMacF32Insts() &&
5415 denormalModeIsFlushAllF32(*MI.getMF());
5416
5417 return false;
5418}
5419
5420bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5421 const SDNode *N) const {
5422 // TODO: Check future ftz flag
5423 // v_mad_f32/v_mac_f32 do not support denormals.
5424 EVT VT = N->getValueType(0);
5425 if (VT == MVT::f32)
5426 return Subtarget->hasMadMacF32Insts() &&
5427 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5428 if (VT == MVT::f16) {
5429 return Subtarget->hasMadF16() &&
5430 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5431 }
5432
5433 return false;
5434}
5435
5436//===----------------------------------------------------------------------===//
5437// Custom DAG Lowering Operations
5438//===----------------------------------------------------------------------===//
5439
5440// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5441// wider vector type is legal.
5442SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5443 SelectionDAG &DAG) const {
5444 unsigned Opc = Op.getOpcode();
5445 EVT VT = Op.getValueType();
5446 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5447 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5448 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5449 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5450
5451 SDValue Lo, Hi;
5452 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5453
5454 SDLoc SL(Op);
5455 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5456 Op->getFlags());
5457 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5458 Op->getFlags());
5459
5460 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5461}
5462
5463// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5464// wider vector type is legal.
5465SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5466 SelectionDAG &DAG) const {
5467 unsigned Opc = Op.getOpcode();
5468 EVT VT = Op.getValueType();
5469 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5470 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5471 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5472 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5473
5474 SDValue Lo0, Hi0;
5475 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5476 SDValue Lo1, Hi1;
5477 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5478
5479 SDLoc SL(Op);
5480
5481 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5482 Op->getFlags());
5483 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5484 Op->getFlags());
5485
5486 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5487}
5488
5489SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5490 SelectionDAG &DAG) const {
5491 unsigned Opc = Op.getOpcode();
5492 EVT VT = Op.getValueType();
5493 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5494 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5495 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5496 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5497 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5498 VT == MVT::v32bf16);
5499
5500 SDValue Lo0, Hi0;
5501 SDValue Op0 = Op.getOperand(0);
5502 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5503 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5504 : std::pair(Op0, Op0);
5505 SDValue Lo1, Hi1;
5506 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5507 SDValue Lo2, Hi2;
5508 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5509
5510 SDLoc SL(Op);
5511 auto ResVT = DAG.GetSplitDestVTs(VT);
5512
5513 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5514 Op->getFlags());
5515 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5516 Op->getFlags());
5517
5518 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5519}
5520
5521
5522SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5523 switch (Op.getOpcode()) {
5524 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5525 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5526 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5527 case ISD::LOAD: {
5528 SDValue Result = LowerLOAD(Op, DAG);
5529 assert((!Result.getNode() ||
5530 Result.getNode()->getNumValues() == 2) &&
5531 "Load should return a value and a chain");
5532 return Result;
5533 }
5534 case ISD::FSQRT: {
5535 EVT VT = Op.getValueType();
5536 if (VT == MVT::f32)
5537 return lowerFSQRTF32(Op, DAG);
5538 if (VT == MVT::f64)
5539 return lowerFSQRTF64(Op, DAG);
5540 return SDValue();
5541 }
5542 case ISD::FSIN:
5543 case ISD::FCOS:
5544 return LowerTrig(Op, DAG);
5545 case ISD::SELECT: return LowerSELECT(Op, DAG);
5546 case ISD::FDIV: return LowerFDIV(Op, DAG);
5547 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5548 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5549 case ISD::STORE: return LowerSTORE(Op, DAG);
5550 case ISD::GlobalAddress: {
5551 MachineFunction &MF = DAG.getMachineFunction();
5552 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5553 return LowerGlobalAddress(MFI, Op, DAG);
5554 }
5555 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5556 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5557 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5558 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5559 case ISD::INSERT_SUBVECTOR:
5560 return lowerINSERT_SUBVECTOR(Op, DAG);
5561 case ISD::INSERT_VECTOR_ELT:
5562 return lowerINSERT_VECTOR_ELT(Op, DAG);
5563 case ISD::EXTRACT_VECTOR_ELT:
5564 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5565 case ISD::VECTOR_SHUFFLE:
5566 return lowerVECTOR_SHUFFLE(Op, DAG);
5567 case ISD::SCALAR_TO_VECTOR:
5568 return lowerSCALAR_TO_VECTOR(Op, DAG);
5569 case ISD::BUILD_VECTOR:
5570 return lowerBUILD_VECTOR(Op, DAG);
5571 case ISD::FP_ROUND:
5572 case ISD::STRICT_FP_ROUND:
5573 return lowerFP_ROUND(Op, DAG);
5574 case ISD::FPTRUNC_ROUND: {
5575 unsigned Opc;
5576 SDLoc DL(Op);
5577
5578 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5579 return SDValue();
5580
5581 // Get the rounding mode from the last operand
5582 int RoundMode = Op.getConstantOperandVal(1);
5583 if (RoundMode == (int)RoundingMode::TowardPositive)
5584 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5585 else if (RoundMode == (int)RoundingMode::TowardNegative)
5586 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5587 else
5588 return SDValue();
5589
5590 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5591 }
5592 case ISD::TRAP:
5593 return lowerTRAP(Op, DAG);
5594 case ISD::DEBUGTRAP:
5595 return lowerDEBUGTRAP(Op, DAG);
5596 case ISD::FABS:
5597 case ISD::FNEG:
5598 case ISD::FCANONICALIZE:
5599 case ISD::BSWAP:
5600 return splitUnaryVectorOp(Op, DAG);
5601 case ISD::FMINNUM:
5602 case ISD::FMAXNUM:
5603 return lowerFMINNUM_FMAXNUM(Op, DAG);
5604 case ISD::FLDEXP:
5605 case ISD::STRICT_FLDEXP:
5606 return lowerFLDEXP(Op, DAG);
5607 case ISD::FMA:
5608 return splitTernaryVectorOp(Op, DAG);
5609 case ISD::FP_TO_SINT:
5610 case ISD::FP_TO_UINT:
5611 return LowerFP_TO_INT(Op, DAG);
5612 case ISD::SHL:
5613 case ISD::SRA:
5614 case ISD::SRL:
5615 case ISD::ADD:
5616 case ISD::SUB:
5617 case ISD::SMIN:
5618 case ISD::SMAX:
5619 case ISD::UMIN:
5620 case ISD::UMAX:
5621 case ISD::FADD:
5622 case ISD::FMUL:
5623 case ISD::FMINNUM_IEEE:
5624 case ISD::FMAXNUM_IEEE:
5625 case ISD::UADDSAT:
5626 case ISD::USUBSAT:
5627 case ISD::SADDSAT:
5628 case ISD::SSUBSAT:
5629 return splitBinaryVectorOp(Op, DAG);
5630 case ISD::MUL:
5631 return lowerMUL(Op, DAG);
5632 case ISD::SMULO:
5633 case ISD::UMULO:
5634 return lowerXMULO(Op, DAG);
5635 case ISD::SMUL_LOHI:
5636 case ISD::UMUL_LOHI:
5637 return lowerXMUL_LOHI(Op, DAG);
5638 case ISD::DYNAMIC_STACKALLOC:
5639 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5640 case ISD::STACKSAVE:
5641 return LowerSTACKSAVE(Op, DAG);
5642 case ISD::GET_ROUNDING:
5643 return lowerGET_ROUNDING(Op, DAG);
5644 case ISD::PREFETCH:
5645 return lowerPREFETCH(Op, DAG);
5646 case ISD::FP_EXTEND:
5647 case ISD::STRICT_FP_EXTEND:
5648 return lowerFP_EXTEND(Op, DAG);
5649 }
5650 return SDValue();
5651}
5652
5653// Used for D16: Casts the result of an instruction into the right vector,
5654// packs values if loads return unpacked values.
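// For example, with unpacked D16 a v3f16 load comes back as v3i32: each
// element is truncated to i16, the vector is padded to a legal v4i16, and the
// result is bitcast back to the widened f16 vector type.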
5655static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5656 const SDLoc &DL,
5657 SelectionDAG &DAG, bool Unpacked) {
5658 if (!LoadVT.isVector())
5659 return Result;
5660
5661 // Cast back to the original packed type or to a larger type that is a
5662 // multiple of 32 bits for D16. Widening the return type is required for
5663 // legalization.
5664 EVT FittingLoadVT = LoadVT;
5665 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5666 FittingLoadVT =
5667 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5668 LoadVT.getVectorNumElements() + 1);
5669 }
5670
5671 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5672 // Truncate to v2i16/v4i16.
5673 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5674
5675 // Work around the legalizer not scalarizing truncate after vector op
5676 // legalization but not creating intermediate vector trunc.
5677 SmallVector<SDValue, 4> Elts;
5678 DAG.ExtractVectorElements(Result, Elts);
5679 for (SDValue &Elt : Elts)
5680 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5681
5682 // Pad illegal v1i16/v3f16 to v4i16
5683 if ((LoadVT.getVectorNumElements() % 2) == 1)
5684 Elts.push_back(DAG.getUNDEF(MVT::i16));
5685
5686 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5687
5688 // Bitcast to original type (v2f16/v4f16).
5689 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5690 }
5691
5692 // Cast back to the original packed type.
5693 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5694}
5695
5696SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5697 MemSDNode *M,
5698 SelectionDAG &DAG,
5699 ArrayRef<SDValue> Ops,
5700 bool IsIntrinsic) const {
5701 SDLoc DL(M);
5702
5703 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5704 EVT LoadVT = M->getValueType(0);
5705
5706 EVT EquivLoadVT = LoadVT;
5707 if (LoadVT.isVector()) {
5708 if (Unpacked) {
5709 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5710 LoadVT.getVectorNumElements());
5711 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5712 // Widen v3f16 to legal type
5713 EquivLoadVT =
5714 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5715 LoadVT.getVectorNumElements() + 1);
5716 }
5717 }
5718
5719 // Change from v4f16/v2f16 to EquivLoadVT.
5720 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5721
5722 SDValue Load
5723 = DAG.getMemIntrinsicNode(
5724 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5725 VTList, Ops, M->getMemoryVT(),
5726 M->getMemOperand());
5727
5728 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5729
5730 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5731}
5732
5733SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5734 SelectionDAG &DAG,
5735 ArrayRef<SDValue> Ops) const {
5736 SDLoc DL(M);
5737 EVT LoadVT = M->getValueType(0);
5738 EVT EltType = LoadVT.getScalarType();
5739 EVT IntVT = LoadVT.changeTypeToInteger();
5740
5741 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5742
5743 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5744 bool IsTFE = M->getNumValues() == 3;
5745
5746 unsigned Opc;
5747 if (IsFormat) {
5748 Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5749 : AMDGPUISD::BUFFER_LOAD_FORMAT;
5750 } else {
5751 // TODO: Support non-format TFE loads.
5752 if (IsTFE)
5753 return SDValue();
5754 Opc = AMDGPUISD::BUFFER_LOAD;
5755 }
5756
5757 if (IsD16) {
5758 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5759 }
5760
5761 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5762 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5763 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
5764
5765 if (isTypeLegal(LoadVT)) {
5766 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5767 M->getMemOperand(), DAG);
5768 }
5769
5770 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5771 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5772 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5773 M->getMemOperand(), DAG);
5774 return DAG.getMergeValues(
5775 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5776 DL);
5777}
5778
5779static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
5780 SDNode *N, SelectionDAG &DAG) {
5781 EVT VT = N->getValueType(0);
5782 unsigned CondCode = N->getConstantOperandVal(3);
5783 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5784 return DAG.getUNDEF(VT);
5785
5786 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5787
5788 SDValue LHS = N->getOperand(1);
5789 SDValue RHS = N->getOperand(2);
5790
5791 SDLoc DL(N);
5792
5793 EVT CmpVT = LHS.getValueType();
5794 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
5795 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
5796 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5797 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
5798 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
5799 }
5800
5801 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
5802
5803 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5804 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5805
5806 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
5807 DAG.getCondCode(CCOpcode));
5808 if (VT.bitsEq(CCVT))
5809 return SetCC;
5810 return DAG.getZExtOrTrunc(SetCC, DL, VT);
5811}
5812
5813static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
5814 SDNode *N, SelectionDAG &DAG) {
5815 EVT VT = N->getValueType(0);
5816
5817 unsigned CondCode = N->getConstantOperandVal(3);
5818 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
5819 return DAG.getUNDEF(VT);
5820
5821 SDValue Src0 = N->getOperand(1);
5822 SDValue Src1 = N->getOperand(2);
5823 EVT CmpVT = Src0.getValueType();
5824 SDLoc SL(N);
5825
5826 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
5827 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5828 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5829 }
5830
5831 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
5832 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
5833 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5834 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5835 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
5836 Src1, DAG.getCondCode(CCOpcode));
5837 if (VT.bitsEq(CCVT))
5838 return SetCC;
5839 return DAG.getZExtOrTrunc(SetCC, SL, VT);
5840}
5841
5842static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
5843 SelectionDAG &DAG) {
5844 EVT VT = N->getValueType(0);
5845 SDValue Src = N->getOperand(1);
5846 SDLoc SL(N);
5847
5848 if (Src.getOpcode() == ISD::SETCC) {
5849 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
5850 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
5851 Src.getOperand(1), Src.getOperand(2));
5852 }
5853 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
5854 // (ballot 0) -> 0
5855 if (Arg->isZero())
5856 return DAG.getConstant(0, SL, VT);
5857
5858 // (ballot 1) -> EXEC/EXEC_LO
5859 if (Arg->isOne()) {
5860 Register Exec;
5861 if (VT.getScalarSizeInBits() == 32)
5862 Exec = AMDGPU::EXEC_LO;
5863 else if (VT.getScalarSizeInBits() == 64)
5864 Exec = AMDGPU::EXEC;
5865 else
5866 return SDValue();
5867
5868 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
5869 }
5870 }
5871
5872 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
5873 // ISD::SETNE)
5874 return DAG.getNode(
5875 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5876 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5877}
5878
5879void SITargetLowering::ReplaceNodeResults(SDNode *N,
5880 SmallVectorImpl<SDValue> &Results,
5881 SelectionDAG &DAG) const {
5882 switch (N->getOpcode()) {
5883 case ISD::INSERT_VECTOR_ELT: {
5884 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
5885 Results.push_back(Res);
5886 return;
5887 }
5888 case ISD::EXTRACT_VECTOR_ELT: {
5889 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
5890 Results.push_back(Res);
5891 return;
5892 }
5893 case ISD::INTRINSIC_WO_CHAIN: {
5894 unsigned IID = N->getConstantOperandVal(0);
5895 switch (IID) {
5896 case Intrinsic::amdgcn_make_buffer_rsrc:
5897 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
5898 return;
5899 case Intrinsic::amdgcn_cvt_pkrtz: {
5900 SDValue Src0 = N->getOperand(1);
5901 SDValue Src1 = N->getOperand(2);
5902 SDLoc SL(N);
5903 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
5904 Src0, Src1);
5905 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
5906 return;
5907 }
5908 case Intrinsic::amdgcn_cvt_pknorm_i16:
5909 case Intrinsic::amdgcn_cvt_pknorm_u16:
5910 case Intrinsic::amdgcn_cvt_pk_i16:
5911 case Intrinsic::amdgcn_cvt_pk_u16: {
5912 SDValue Src0 = N->getOperand(1);
5913 SDValue Src1 = N->getOperand(2);
5914 SDLoc SL(N);
5915 unsigned Opcode;
5916
5917 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
5918 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5919 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
5920 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5921 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
5922 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5923 else
5924 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5925
5926 EVT VT = N->getValueType(0);
5927 if (isTypeLegal(VT))
5928 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
5929 else {
5930 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
5931 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
5932 }
5933 return;
5934 }
5935 case Intrinsic::amdgcn_s_buffer_load: {
5936 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
5937 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
5938 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
5939 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
5940 // s_buffer_load_i8.
5941 if (!Subtarget->hasScalarSubwordLoads())
5942 return;
5943 SDValue Op = SDValue(N, 0);
5944 SDValue Rsrc = Op.getOperand(1);
5945 SDValue Offset = Op.getOperand(2);
5946 SDValue CachePolicy = Op.getOperand(3);
5947 EVT VT = Op.getValueType();
5948 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
5949 SDLoc DL(Op);
5950 MachineFunction &MF = DAG.getMachineFunction();
5951 const DataLayout &DataLayout = DAG.getDataLayout();
5952 Align Alignment =
5953 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
5954 MachineMemOperand *MMO = MF.getMachineMemOperand(
5955 MachinePointerInfo(),
5956 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5957 MachineMemOperand::MOInvariant,
5958 VT.getStoreSize(), Alignment);
5959 SDValue LoadVal;
5960 if (!Offset->isDivergent()) {
5961 SDValue Ops[] = {Rsrc, // source register
5962 Offset, CachePolicy};
5963 SDValue BufferLoad =
5964 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
5965 DAG.getVTList(MVT::i32), Ops, VT, MMO);
5966 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
5967 } else {
5968 SDValue Ops[] = {
5969 DAG.getEntryNode(), // Chain
5970 Rsrc, // rsrc
5971 DAG.getConstant(0, DL, MVT::i32), // vindex
5972 {}, // voffset
5973 {}, // soffset
5974 {}, // offset
5975 CachePolicy, // cachepolicy
5976 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
5977 };
5978 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
5979 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
5980 }
5981 Results.push_back(LoadVal);
5982 return;
5983 }
5984 }
5985 break;
5986 }
5987 case ISD::INTRINSIC_W_CHAIN: {
5988 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
5989 if (Res.getOpcode() == ISD::MERGE_VALUES) {
5990 // FIXME: Hacky
5991 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
5992 Results.push_back(Res.getOperand(I));
5993 }
5994 } else {
5995 Results.push_back(Res);
5996 Results.push_back(Res.getValue(1));
5997 }
5998 return;
5999 }
6000
6001 break;
6002 }
6003 case ISD::SELECT: {
6004 SDLoc SL(N);
6005 EVT VT = N->getValueType(0);
6006 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6007 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6008 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6009
6010 EVT SelectVT = NewVT;
6011 if (NewVT.bitsLT(MVT::i32)) {
6012 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6013 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6014 SelectVT = MVT::i32;
6015 }
6016
6017 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6018 N->getOperand(0), LHS, RHS);
6019
6020 if (NewVT != SelectVT)
6021 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6022 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6023 return;
6024 }
6025 case ISD::FNEG: {
6026 if (N->getValueType(0) != MVT::v2f16)
6027 break;
6028
6029 SDLoc SL(N);
6030 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6031
6032 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6033 BC,
6034 DAG.getConstant(0x80008000, SL, MVT::i32));
6035 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6036 return;
6037 }
6038 case ISD::FABS: {
6039 if (N->getValueType(0) != MVT::v2f16)
6040 break;
6041
6042 SDLoc SL(N);
6043 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6044
6045 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6046 BC,
6047 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6048 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6049 return;
6050 }
6051 case ISD::FSQRT: {
6052 if (N->getValueType(0) != MVT::f16)
6053 break;
6054 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6055 break;
6056 }
6057 default:
6058 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6059 break;
6060 }
6061}
6062
6063/// Helper function for LowerBRCOND
6064static SDNode *findUser(SDValue Value, unsigned Opcode) {
6065
6066 SDNode *Parent = Value.getNode();
6067 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6068 I != E; ++I) {
6069
6070 if (I.getUse().get() != Value)
6071 continue;
6072
6073 if (I->getOpcode() == Opcode)
6074 return *I;
6075 }
6076 return nullptr;
6077}
6078
6079unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6080 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6081 switch (Intr->getConstantOperandVal(1)) {
6082 case Intrinsic::amdgcn_if:
6083 return AMDGPUISD::IF;
6084 case Intrinsic::amdgcn_else:
6085 return AMDGPUISD::ELSE;
6086 case Intrinsic::amdgcn_loop:
6087 return AMDGPUISD::LOOP;
6088 case Intrinsic::amdgcn_end_cf:
6089 llvm_unreachable("should not occur");
6090 default:
6091 return 0;
6092 }
6093 }
6094
6095 // break, if_break, else_break are all only used as inputs to loop, not
6096 // directly as branch conditions.
6097 return 0;
6098}
6099
6100bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6101 const Triple &TT = getTargetMachine().getTargetTriple();
6102 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6103 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6104 AMDGPU::shouldEmitConstantsToTextSection(TT);
6105}
6106
6107bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6108 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6109 return false;
6110
6111 // FIXME: Either avoid relying on address space here or change the default
6112 // address space for functions to avoid the explicit check.
6113 return (GV->getValueType()->isFunctionTy() ||
6115 !shouldEmitFixup(GV) &&
6117}
6118
6119bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6120 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6121}
6122
6123bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6124 if (!GV->hasExternalLinkage())
6125 return true;
6126
6127 const auto OS = getTargetMachine().getTargetTriple().getOS();
6128 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6129}
6130
6131/// This transforms the control flow intrinsics to get the branch destination as
6132/// the last parameter; it also switches the branch target with BR if the need arises.
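/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is rebuilt
/// as an AMDGPUISD::IF node that carries the branch destination block as its
/// final operand.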
6133SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6134 SelectionDAG &DAG) const {
6135 SDLoc DL(BRCOND);
6136
6137 SDNode *Intr = BRCOND.getOperand(1).getNode();
6138 SDValue Target = BRCOND.getOperand(2);
6139 SDNode *BR = nullptr;
6140 SDNode *SetCC = nullptr;
6141
6142 if (Intr->getOpcode() == ISD::SETCC) {
6143 // As long as we negate the condition everything is fine
6144 SetCC = Intr;
6145 Intr = SetCC->getOperand(0).getNode();
6146
6147 } else {
6148 // Get the target from BR if we don't negate the condition
6149 BR = findUser(BRCOND, ISD::BR);
6150 assert(BR && "brcond missing unconditional branch user");
6151 Target = BR->getOperand(1);
6152 }
6153
6154 unsigned CFNode = isCFIntrinsic(Intr);
6155 if (CFNode == 0) {
6156 // This is a uniform branch so we don't need to legalize.
6157 return BRCOND;
6158 }
6159
6160 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6161 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6162
6163 assert(!SetCC ||
6164 (SetCC->getConstantOperandVal(1) == 1 &&
6165 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6166 ISD::SETNE));
6167
6168 // operands of the new intrinsic call
6169 SmallVector<SDValue, 4> Ops;
6170 if (HaveChain)
6171 Ops.push_back(BRCOND.getOperand(0));
6172
6173 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6174 Ops.push_back(Target);
6175
6176 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6177
6178 // build the new intrinsic call
6179 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6180
6181 if (!HaveChain) {
6182 SDValue Ops[] = {
6183 SDValue(Result, 0),
6184 BRCOND.getOperand(0)
6185 };
6186
6187 Result = DAG.getMergeValues(Ops, DL).getNode();
6188 }
6189
6190 if (BR) {
6191 // Give the branch instruction our target
6192 SDValue Ops[] = {
6193 BR->getOperand(0),
6194 BRCOND.getOperand(2)
6195 };
6196 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6197 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6198 }
6199
6200 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6201
6202 // Copy the intrinsic results to registers
6203 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6204 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6205 if (!CopyToReg)
6206 continue;
6207
6208 Chain = DAG.getCopyToReg(
6209 Chain, DL,
6210 CopyToReg->getOperand(1),
6211 SDValue(Result, i - 1),
6212 SDValue());
6213
6214 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6215 }
6216
6217 // Remove the old intrinsic from the chain
6218 DAG.ReplaceAllUsesOfValueWith(
6219 SDValue(Intr, Intr->getNumValues() - 1),
6220 Intr->getOperand(0));
6221
6222 return Chain;
6223}
6224
6225SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6226 SelectionDAG &DAG) const {
6227 MVT VT = Op.getSimpleValueType();
6228 SDLoc DL(Op);
6229 // Checking the depth
6230 if (Op.getConstantOperandVal(0) != 0)
6231 return DAG.getConstant(0, DL, VT);
6232
6233 MachineFunction &MF = DAG.getMachineFunction();
6234 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6235 // Check for kernel and shader functions
6236 if (Info->isEntryFunction())
6237 return DAG.getConstant(0, DL, VT);
6238
6239 MachineFrameInfo &MFI = MF.getFrameInfo();
6240 // There is a call to @llvm.returnaddress in this function
6241 MFI.setReturnAddressIsTaken(true);
6242
6244 // Get the return address reg and mark it as an implicit live-in
6245 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6246
6247 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6248}
6249
6250SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6251 SDValue Op,
6252 const SDLoc &DL,
6253 EVT VT) const {
6254 return Op.getValueType().bitsLE(VT) ?
6255 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6256 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6257 DAG.getTargetConstant(0, DL, MVT::i32));
6258}
6259
6260SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6261 assert(Op.getValueType() == MVT::f16 &&
6262 "Do not know how to custom lower FP_ROUND for non-f16 type");
6263
6264 SDValue Src = Op.getOperand(0);
6265 EVT SrcVT = Src.getValueType();
6266 if (SrcVT != MVT::f64)
6267 return Op;
6268
6269 // TODO: Handle strictfp
6270 if (Op.getOpcode() != ISD::FP_ROUND)
6271 return Op;
6272
6273 SDLoc DL(Op);
6274
6275 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6276 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6277 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6278}
6279
6280SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6281 SelectionDAG &DAG) const {
6282 EVT VT = Op.getValueType();
6283 const MachineFunction &MF = DAG.getMachineFunction();
6284 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6285 bool IsIEEEMode = Info->getMode().IEEE;
6286
6287 // FIXME: Assert during selection that this is only selected for
6288 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6289 // mode functions, but this happens to be OK since it's only done in cases
6290 // where there is known no sNaN.
6291 if (IsIEEEMode)
6292 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6293
6294 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6295 VT == MVT::v32f16)
6296 return splitBinaryVectorOp(Op, DAG);
6297 return Op;
6298}
6299
6300SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6301 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6302 EVT VT = Op.getValueType();
6303 assert(VT == MVT::f16);
6304
6305 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6306 EVT ExpVT = Exp.getValueType();
6307 if (ExpVT == MVT::i16)
6308 return Op;
6309
6310 SDLoc DL(Op);
6311
6312 // Correct the exponent type for f16 to i16.
6313 // Clamp the range of the exponent to the instruction's range.
6314
6315 // TODO: This should be a generic narrowing legalization, and can easily be
6316 // done for GlobalISel.
6317
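// For example, ldexp(half %x, i32 %e) first clamps %e to [-32768, 32767]
// with smax/smin and only then truncates it to i16 for the instruction.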
6318 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6319 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6320
6321 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6322 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6323
6324 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6325
6326 if (IsStrict) {
6327 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6328 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6329 }
6330
6331 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6332}
6333
6334// Custom lowering for vector multiplications and s_mul_u64.
6335SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6336 EVT VT = Op.getValueType();
6337
6338 // Split vector operands.
6339 if (VT.isVector())
6340 return splitBinaryVectorOp(Op, DAG);
6341
6342 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6343
6344 // There are four ways to lower s_mul_u64:
6345 //
6346 // 1. If all the operands are uniform, then we lower it as it is.
6347 //
6348 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6349 // multiplications because there is not a vector equivalent of s_mul_u64.
6350 //
6351 // 3. If the cost model decides that it is more efficient to use vector
6352 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6353 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6354 //
6355 // 4. If the cost model decides to use vector registers and both of the
6356 // operands are zero-extended/sign-extended from 32-bits, then we split the
6357 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6358 // possible to check if the operands are zero-extended or sign-extended in
6359 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6360 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6361 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6362 // If the cost model decides that we have to use vector registers, then
6363 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6364 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6365 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6366 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6367 // SIInstrInfo.cpp .
6368
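// For instance, when both operands are (zext i32 ... to i64), computeKnownBits
// reports at least 32 leading zeros on each side and the node becomes
// S_MUL_U64_U32_PSEUDO below; sign-extended operands are detected with
// ComputeNumSignBits and become S_MUL_I64_I32_PSEUDO instead.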
6369 if (Op->isDivergent())
6370 return SDValue();
6371
6372 SDValue Op0 = Op.getOperand(0);
6373 SDValue Op1 = Op.getOperand(1);
6374 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6375 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6376 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6377 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6378 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6379 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6380 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6381 SDLoc SL(Op);
6382 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6383 return SDValue(
6384 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6385 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6386 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6387 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6388 return SDValue(
6389 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6390 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6391 return Op;
6392}
6393
6394SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6395 EVT VT = Op.getValueType();
6396 SDLoc SL(Op);
6397 SDValue LHS = Op.getOperand(0);
6398 SDValue RHS = Op.getOperand(1);
6399 bool isSigned = Op.getOpcode() == ISD::SMULO;
6400
6401 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6402 const APInt &C = RHSC->getAPIntValue();
6403 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
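// For example, umulo(x, 8) becomes Result = x << 3 and
// Overflow = ((x << 3) >> 3) != x, using an arithmetic shift for the signed
// case so that sign bits survive the round trip.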
6404 if (C.isPowerOf2()) {
6405 // smulo(x, signed_min) is same as umulo(x, signed_min).
6406 bool UseArithShift = isSigned && !C.isMinSignedValue();
6407 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6408 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6409 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6410 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6411 SL, VT, Result, ShiftAmt),
6412 LHS, ISD::SETNE);
6413 return DAG.getMergeValues({ Result, Overflow }, SL);
6414 }
6415 }
6416
6417 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6418 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6419 SL, VT, LHS, RHS);
6420
6421 SDValue Sign = isSigned
6422 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6423 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6424 : DAG.getConstant(0, SL, VT);
6425 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6426
6427 return DAG.getMergeValues({ Result, Overflow }, SL);
6428}
6429
6430SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6431 if (Op->isDivergent()) {
6432 // Select to V_MAD_[IU]64_[IU]32.
6433 return Op;
6434 }
6435 if (Subtarget->hasSMulHi()) {
6436 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6437 return SDValue();
6438 }
6439 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6440 // calculate the high part, so we might as well do the whole thing with
6441 // V_MAD_[IU]64_[IU]32.
6442 return Op;
6443}
6444
6445SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6446 if (!Subtarget->isTrapHandlerEnabled() ||
6447 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6448 return lowerTrapEndpgm(Op, DAG);
6449
6450 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6451 lowerTrapHsaQueuePtr(Op, DAG);
6452}
6453
6454SDValue SITargetLowering::lowerTrapEndpgm(
6455 SDValue Op, SelectionDAG &DAG) const {
6456 SDLoc SL(Op);
6457 SDValue Chain = Op.getOperand(0);
6458 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6459}
6460
6461SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6462 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6463 MachineFunction &MF = DAG.getMachineFunction();
6464 uint64_t Offset = getImplicitParameterOffset(MF, Param);
6465 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6466 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6467 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6468 MachineMemOperand::MODereferenceable |
6469 MachineMemOperand::MOInvariant);
6470}
6471
6472SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6473 SDValue Op, SelectionDAG &DAG) const {
6474 SDLoc SL(Op);
6475 SDValue Chain = Op.getOperand(0);
6476
6477 SDValue QueuePtr;
6478 // For code object version 5, QueuePtr is passed through implicit kernarg.
6479 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6480 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6481 QueuePtr =
6482 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6483 } else {
6484 MachineFunction &MF = DAG.getMachineFunction();
6485 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6486 Register UserSGPR = Info->getQueuePtrUserSGPR();
6487
6488 if (UserSGPR == AMDGPU::NoRegister) {
6489 // We probably are in a function incorrectly marked with
6490 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6491 // trap, so just use a null pointer.
6492 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6493 } else {
6494 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6495 MVT::i64);
6496 }
6497 }
6498
6499 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6500 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6501 QueuePtr, SDValue());
6502
6504 SDValue Ops[] = {
6505 ToReg,
6506 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6507 SGPR01,
6508 ToReg.getValue(1)
6509 };
6510 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6511}
6512
6513SDValue SITargetLowering::lowerTrapHsa(
6514 SDValue Op, SelectionDAG &DAG) const {
6515 SDLoc SL(Op);
6516 SDValue Chain = Op.getOperand(0);
6517
6518 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6519 SDValue Ops[] = {
6520 Chain,
6521 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6522 };
6523 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6524}
6525
6526SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6527 SDLoc SL(Op);
6528 SDValue Chain = Op.getOperand(0);
6529 MachineFunction &MF = DAG.getMachineFunction();
6530
6531 if (!Subtarget->isTrapHandlerEnabled() ||
6532 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6533 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
6534 "debugtrap handler not supported",
6535 Op.getDebugLoc(),
6536 DS_Warning);
6537 LLVMContext &Ctx = MF.getFunction().getContext();
6538 Ctx.diagnose(NoTrap);
6539 return Chain;
6540 }
6541
6542 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
6543 SDValue Ops[] = {
6544 Chain,
6545 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6546 };
6547 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6548}
6549
6550SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6551 SelectionDAG &DAG) const {
6552 if (Subtarget->hasApertureRegs()) {
6553 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6554 ? AMDGPU::SRC_SHARED_BASE
6555 : AMDGPU::SRC_PRIVATE_BASE;
6556 // Note: this feature (register) is broken. When used as a 32-bit operand,
6557 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6558 // bits.
6559 //
6560 // To work around the issue, directly emit a 64 bit mov from this register
6561 // then extract the high bits. Note that this shouldn't even result in a
6562 // shift being emitted and simply become a pair of registers (e.g.):
6563 // s_mov_b64 s[6:7], src_shared_base
6564 // v_mov_b32_e32 v1, s7
6565 //
6566 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6567 // coalescing would kick in and it would think it's okay to use the "HI"
6568 // subregister directly (instead of extracting the HI 32 bits) which is an
6569 // artificial (unusable) register.
6570 // Register TableGen definitions would need an overhaul to get rid of the
6571 // artificial "HI" aperture registers and prevent this kind of issue from
6572 // happening.
6573 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6574 DAG.getRegister(ApertureRegNo, MVT::i64));
6575 return DAG.getNode(
6576 ISD::TRUNCATE, DL, MVT::i32,
6577 DAG.getNode(ISD::SRL, DL, MVT::i64,
6578 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6579 }
6580
6581 // For code object version 5, private_base and shared_base are passed through
6582 // implicit kernargs.
6583 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6584 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6585 ImplicitParameter Param =
6586 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
6587 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6588 }
6589
6590 MachineFunction &MF = DAG.getMachineFunction();
6591 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6592 Register UserSGPR = Info->getQueuePtrUserSGPR();
6593 if (UserSGPR == AMDGPU::NoRegister) {
6594 // We probably are in a function incorrectly marked with
6595 // amdgpu-no-queue-ptr. This is undefined.
6596 return DAG.getUNDEF(MVT::i32);
6597 }
6598
6599 SDValue QueuePtr = CreateLiveInRegister(
6600 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6601
6602 // Offset into amd_queue_t for group_segment_aperture_base_hi /
6603 // private_segment_aperture_base_hi.
6604 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6605
6606 SDValue Ptr =
6607 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
6608
6609 // TODO: Use custom target PseudoSourceValue.
6610 // TODO: We should use the value from the IR intrinsic call, but it might not
6611 // be available and how do we get it?
6612 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6613 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6614 commonAlignment(Align(64), StructOffset),
6615 MachineMemOperand::MODereferenceable |
6616 MachineMemOperand::MOInvariant);
6617}
6618
6619/// Return true if the value is a known valid address, such that a null check is
6620/// not necessary.
6621static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
6622 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6623 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6624 isa<BasicBlockSDNode>(Val))
6625 return true;
6626
6627 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6628 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6629
6630 // TODO: Search through arithmetic, handle arguments and loads
6631 // marked nonnull.
6632 return false;
6633}
6634
6635SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6636 SelectionDAG &DAG) const {
6637 SDLoc SL(Op);
6638 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
6639
6640 SDValue Src = ASC->getOperand(0);
6641 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6642 unsigned SrcAS = ASC->getSrcAddressSpace();
6643
6644 const AMDGPUTargetMachine &TM =
6645 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6646
6647 // flat -> local/private
6648 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6649 unsigned DestAS = ASC->getDestAddressSpace();
6650
6651 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6652 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6653 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6654
6655 if (isKnownNonNull(Src, DAG, TM, SrcAS))
6656 return Ptr;
6657
6658 unsigned NullVal = TM.getNullPointerValue(DestAS);
6659 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6660 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6661
6662 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6663 SegmentNullPtr);
6664 }
6665 }
6666
6667 // local/private -> flat
6668 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
6669 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6670 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6671
6672 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
6673 SDValue CvtPtr =
6674 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6675 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6676
6677 if (isKnownNonNull(Src, DAG, TM, SrcAS))
6678 return CvtPtr;
6679
6680 unsigned NullVal = TM.getNullPointerValue(SrcAS);
6681 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6682
6683 SDValue NonNull
6684 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6685
6686 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6687 FlatNullPtr);
6688 }
6689 }
6690
6691 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6692 Op.getValueType() == MVT::i64) {
6693 const SIMachineFunctionInfo *Info =
6694 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
6695 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6696 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6697 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6698 }
6699
6700 if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6701 Src.getValueType() == MVT::i64)
6702 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6703
6704 // global <-> flat are no-ops and never emitted.
6705
6706 const MachineFunction &MF = DAG.getMachineFunction();
6707 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6708 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6709 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
6710
6711 return DAG.getUNDEF(ASC->getValueType(0));
6712}
6713
6714// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
6715// the small vector and inserting them into the big vector. That is better than
6716// the default expansion of doing it via a stack slot. Even though the use of
6717// the stack slot would be optimized away afterwards, the stack slot itself
6718// remains.
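// For example, inserting a v2i16 subvector at an even index into v8i16 is done
// as a single 32-bit element insert on the bitcast v4i32 value; otherwise each
// element is extracted and re-inserted individually.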
6719SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6720 SelectionDAG &DAG) const {
6721 SDValue Vec = Op.getOperand(0);
6722 SDValue Ins = Op.getOperand(1);
6723 SDValue Idx = Op.getOperand(2);
6724 EVT VecVT = Vec.getValueType();
6725 EVT InsVT = Ins.getValueType();
6726 EVT EltVT = VecVT.getVectorElementType();
6727 unsigned InsNumElts = InsVT.getVectorNumElements();
6728 unsigned IdxVal = Idx->getAsZExtVal();
6729 SDLoc SL(Op);
6730
6731 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6732 // Insert 32-bit registers at a time.
6733 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6734
6735 unsigned VecNumElts = VecVT.getVectorNumElements();
6736 EVT NewVecVT =
6737 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6738 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6739 : EVT::getVectorVT(*DAG.getContext(),
6740 MVT::i32, InsNumElts / 2);
6741
6742 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6743 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6744
6745 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6746 SDValue Elt;
6747 if (InsNumElts == 2) {
6748 Elt = Ins;
6749 } else {
6750 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6751 DAG.getConstant(I, SL, MVT::i32));
6752 }
6753 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6754 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6755 }
6756
6757 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6758 }
6759
6760 for (unsigned I = 0; I != InsNumElts; ++I) {
6761 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6762 DAG.getConstant(I, SL, MVT::i32));
6763 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6764 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6765 }
6766 return Vec;
6767}
6768
6769SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6770 SelectionDAG &DAG) const {
6771 SDValue Vec = Op.getOperand(0);
6772 SDValue InsVal = Op.getOperand(1);
6773 SDValue Idx = Op.getOperand(2);
6774 EVT VecVT = Vec.getValueType();
6775 EVT EltVT = VecVT.getVectorElementType();
6776 unsigned VecSize = VecVT.getSizeInBits();
6777 unsigned EltSize = EltVT.getSizeInBits();
6778 SDLoc SL(Op);
6779
6780 // Specially handle the case of v4i16 with static indexing.
6781 unsigned NumElts = VecVT.getVectorNumElements();
6782 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
6783 if (NumElts == 4 && EltSize == 16 && KIdx) {
6784 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
6785
6786 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6787 DAG.getConstant(0, SL, MVT::i32));
6788 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6789 DAG.getConstant(1, SL, MVT::i32));
6790
6791 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
6792 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
6793
6794 unsigned Idx = KIdx->getZExtValue();
6795 bool InsertLo = Idx < 2;
6796 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
6797 InsertLo ? LoVec : HiVec,
6798 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
6799 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
6800
6801 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
6802
6803 SDValue Concat = InsertLo ?
6804 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
6805 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
6806
6807 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
6808 }
6809
6810 // Static indexing does not lower to stack access, and hence there is no need
6811 // for special custom lowering to avoid stack access.
6812 if (isa<ConstantSDNode>(Idx))
6813 return SDValue();
6814
6815 // Avoid stack access for dynamic indexing by custom lowering to
6816 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
6817
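// For example, for a v4i16 vector with a dynamic index of 2 this computes
// ScaledIdx = 32 and BFM = 0xffff << 32; the splatted value is ANDed with
// BFM, the original vector with ~BFM, and the two halves are ORed together.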
6818 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
6819
6820 MVT IntVT = MVT::getIntegerVT(VecSize);
6821
6822 // Convert vector index to bit-index and get the required bit mask.
6823 assert(isPowerOf2_32(EltSize));
6824 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
6825 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6826 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6827 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
6828 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
6829
6830 // 1. Create a congruent vector with the target value in each element.
6831 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
6832 DAG.getSplatBuildVector(VecVT, SL, InsVal));
6833
6834 // 2. Mask off all other indices except the required index within (1).
6835 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
6836
6837 // 3. Mask off the required index within the target vector.
6838 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6839 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
6840 DAG.getNOT(SL, BFM, IntVT), BCVec);
6841
6842 // 4. Get (2) and (3) ORed into the target vector.
6843 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
6844
6845 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
6846}
6847
6848SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
6849 SelectionDAG &DAG) const {
6850 SDLoc SL(Op);
6851
6852 EVT ResultVT = Op.getValueType();
6853 SDValue Vec = Op.getOperand(0);
6854 SDValue Idx = Op.getOperand(1);
6855 EVT VecVT = Vec.getValueType();
6856 unsigned VecSize = VecVT.getSizeInBits();
6857 EVT EltVT = VecVT.getVectorElementType();
6858
6859 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
6860
6861 // Make sure we do any optimizations that will make it easier to fold
6862 // source modifiers before obscuring it with bit operations.
6863
6864 // XXX - Why doesn't this get called when vector_shuffle is expanded?
6865 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
6866 return Combined;
6867
6868 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
6869 SDValue Lo, Hi;
6870 EVT LoVT, HiVT;
6871 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
6872
6873 if (VecSize == 128) {
6874 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
6875 Lo = DAG.getBitcast(LoVT,
6876 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6877 DAG.getConstant(0, SL, MVT::i32)));
6878 Hi = DAG.getBitcast(HiVT,
6879 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6880 DAG.getConstant(1, SL, MVT::i32)));
6881 } else if (VecSize == 256) {
6882 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
6883 SDValue Parts[4];
6884 for (unsigned P = 0; P < 4; ++P) {
6885 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6886 DAG.getConstant(P, SL, MVT::i32));
6887 }
6888
6889 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
6890 Parts[0], Parts[1]));
6891 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
6892 Parts[2], Parts[3]));
6893 } else {
6894 assert(VecSize == 512);
6895
6896 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
6897 SDValue Parts[8];
6898 for (unsigned P = 0; P < 8; ++P) {
6899 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6900 DAG.getConstant(P, SL, MVT::i32));
6901 }
6902
6903 Lo = DAG.getBitcast(LoVT,
6904 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
6905 Parts[0], Parts[1], Parts[2], Parts[3]));
6906 Hi = DAG.getBitcast(HiVT,
6907 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
6908 Parts[4], Parts[5],Parts[6], Parts[7]));
6909 }
6910
6911 EVT IdxVT = Idx.getValueType();
6912 unsigned NElem = VecVT.getVectorNumElements();
6913 assert(isPowerOf2_32(NElem));
6914 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
6915 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
6916 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
6917 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
6918 }
6919
6920 assert(VecSize <= 64);
6921
6922 MVT IntVT = MVT::getIntegerVT(VecSize);
6923
6924 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
6925 SDValue VecBC = peekThroughBitcasts(Vec);
6926 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
6927 SDValue Src = VecBC.getOperand(0);
6928 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
6929 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
6930 }
6931
6932 unsigned EltSize = EltVT.getSizeInBits();
6933 assert(isPowerOf2_32(EltSize));
6934
6935 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6936
6937 // Convert vector index to bit-index (* EltSize)
6938 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6939
6940 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6941 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
6942
6943 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
6944 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
6945 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
6946 }
6947
6948 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
6949}
6950
6951static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
6952 assert(Elt % 2 == 0);
6953 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
6954}
6955
6956SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
6957 SelectionDAG &DAG) const {
6958 SDLoc SL(Op);
6959 EVT ResultVT = Op.getValueType();
6960 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
6961
6962 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
6963 EVT EltVT = PackVT.getVectorElementType();
6964 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
6965
6966 // vector_shuffle <0,1,6,7> lhs, rhs
6967 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
6968 //
6969 // vector_shuffle <6,7,2,3> lhs, rhs
6970 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
6971 //
6972 // vector_shuffle <6,7,0,1> lhs, rhs
6973 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
6974
6975 // Avoid scalarizing when both halves are reading from consecutive elements.
6976 SmallVector<SDValue, 16> Pieces;
6977 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
6978 if (elementPairIsContiguous(SVN->getMask(), I)) {
6979 const int Idx = SVN->getMaskElt(I);
6980 int VecIdx = Idx < SrcNumElts ? 0 : 1;
6981 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
6982 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
6983 PackVT, SVN->getOperand(VecIdx),
6984 DAG.getConstant(EltIdx, SL, MVT::i32));
6985 Pieces.push_back(SubVec);
6986 } else {
6987 const int Idx0 = SVN->getMaskElt(I);
6988 const int Idx1 = SVN->getMaskElt(I + 1);
6989 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
6990 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
6991 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
6992 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
6993
6994 SDValue Vec0 = SVN->getOperand(VecIdx0);
6995 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
6996 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
6997
6998 SDValue Vec1 = SVN->getOperand(VecIdx1);
6999 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7000 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7001 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7002 }
7003 }
7004
7005 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7006}
7007
7008SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7009 SelectionDAG &DAG) const {
7010 SDValue SVal = Op.getOperand(0);
7011 EVT ResultVT = Op.getValueType();
7012 EVT SValVT = SVal.getValueType();
7013 SDValue UndefVal = DAG.getUNDEF(SValVT);
7014 SDLoc SL(Op);
7015
7016 SmallVector<SDValue, 8> VElts;
7017 VElts.push_back(SVal);
7018 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7019 VElts.push_back(UndefVal);
7020
7021 return DAG.getBuildVector(ResultVT, SL, VElts);
7022}
7023
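// Illustrative example of the packing below: a BUILD_VECTOR of four f16
// scalars is split into two v2f16 halves, each half is bitcast to i32, the
// two i32 values are combined into a v2i32, and the result is bitcast back
// to v4f16.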
7024SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7025 SelectionDAG &DAG) const {
7026 SDLoc SL(Op);
7027 EVT VT = Op.getValueType();
7028
7029 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7030 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7031 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7032 VT.getVectorNumElements() / 2);
7033 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7034
7035 // Turn into pair of packed build_vectors.
7036 // TODO: Special case for constants that can be materialized with s_mov_b64.
7037 SmallVector<SDValue, 4> LoOps, HiOps;
7038 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7039 LoOps.push_back(Op.getOperand(I));
7040 HiOps.push_back(Op.getOperand(I + E));
7041 }
7042 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7043 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7044
7045 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7046 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7047
7048 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7049 { CastLo, CastHi });
7050 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7051 }
7052
7053 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7054 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7055 VT.getVectorNumElements() / 4);
7056 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7057
7058 SmallVector<SDValue, 4> Parts[4];
7059 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7060 for (unsigned P = 0; P < 4; ++P)
7061 Parts[P].push_back(Op.getOperand(I + P * E));
7062 }
7063 SDValue Casts[4];
7064 for (unsigned P = 0; P < 4; ++P) {
7065 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7066 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7067 }
7068
7069 SDValue Blend =
7070 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7071 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7072 }
7073
7074 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7075 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7076 VT.getVectorNumElements() / 8);
7077 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7078
7079 SmallVector<SDValue, 8> Parts[8];
7080 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7081 for (unsigned P = 0; P < 8; ++P)
7082 Parts[P].push_back(Op.getOperand(I + P * E));
7083 }
7084 SDValue Casts[8];
7085 for (unsigned P = 0; P < 8; ++P) {
7086 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7087 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7088 }
7089
7090 SDValue Blend =
7091 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7092 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7093 }
7094
7095 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7096 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7097
7098 SDValue Lo = Op.getOperand(0);
7099 SDValue Hi = Op.getOperand(1);
7100
7101 // Avoid adding defined bits with the zero_extend.
7102 if (Hi.isUndef()) {
7103 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7104 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7105 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7106 }
7107
7108 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7109 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7110
7111 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7112 DAG.getConstant(16, SL, MVT::i32));
7113 if (Lo.isUndef())
7114 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7115
7116 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7117 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7118
7119 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7120 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7121}
7122
7123bool
7124SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7125 // We can fold offsets for anything that doesn't require a GOT relocation.
7126 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7127 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7128 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7129 !shouldEmitGOTReloc(GA->getGlobal());
7130}
7131
7132static SDValue
7133buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7134 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7135 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7136 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7137 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7138 // lowered to the following code sequence:
7139 //
7140 // For constant address space:
7141 // s_getpc_b64 s[0:1]
7142 // s_add_u32 s0, s0, $symbol
7143 // s_addc_u32 s1, s1, 0
7144 //
7145 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7146 // a fixup or relocation is emitted to replace $symbol with a literal
7147 // constant, which is a pc-relative offset from the encoding of the $symbol
7148 // operand to the global variable.
7149 //
7150 // For global address space:
7151 // s_getpc_b64 s[0:1]
7152 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7153 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7154 //
7155 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7156 // fixups or relocations are emitted to replace $symbol@*@lo and
7157 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7158 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7159 // operand to the global variable.
7160 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7161 SDValue PtrHi;
7162 if (GAFlags == SIInstrInfo::MO_NONE)
7163 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7164 else
7165 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7166 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7167}
7168
7169SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7170 SDValue Op,
7171 SelectionDAG &DAG) const {
7172 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7173 SDLoc DL(GSD);
7174 EVT PtrVT = Op.getValueType();
7175
7176 const GlobalValue *GV = GSD->getGlobal();
7177 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7178 shouldUseLDSConstAddress(GV)) ||
7179 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7180 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7181 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7182 GV->hasExternalLinkage()) {
7183 Type *Ty = GV->getValueType();
7184 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7185 // zero-sized type in other languages to declare dynamic shared
7186 // memory whose size is not known at compile time. It is allocated
7187 // by the runtime and placed directly after the statically allocated
7188 // ones, so all such arrays share the same offset.
7189 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7190 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7191 // Adjust alignment for that dynamic shared memory array.
7193 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7194 MFI->setUsesDynamicLDS(true);
7195 return SDValue(
7196 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7197 }
7198 }
7199 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7200 }
7201
7202 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7203 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7204 SIInstrInfo::MO_ABS32_LO);
7205 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7206 }
7207
7208 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7209 SDValue AddrLo = DAG.getTargetGlobalAddress(
7210 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7211 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7212
7213 SDValue AddrHi = DAG.getTargetGlobalAddress(
7214 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7215 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7216
7217 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7218 }
7219
7220 if (shouldEmitFixup(GV))
7221 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7222
7223 if (shouldEmitPCReloc(GV))
7224 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7225 SIInstrInfo::MO_REL32);
7226
7227 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7228 SIInstrInfo::MO_GOTPCREL32);
7229
7230 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7231 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7232 const DataLayout &DataLayout = DAG.getDataLayout();
7233 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7234 MachinePointerInfo PtrInfo
7235 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7236
7237 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7238 MachineMemOperand::MODereferenceable |
7239 MachineMemOperand::MOInvariant);
7240}
7241
7242SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7243 const SDLoc &DL, SDValue V) const {
7244 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7245 // the destination register.
7246 //
7247 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7248 // so we will end up with redundant moves to m0.
7249 //
7250 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7251
7252 // A Null SDValue creates a glue result.
7253 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7254 V, Chain);
7255 return SDValue(M0, 0);
7256}
7257
7258SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7259 SDValue Op,
7260 MVT VT,
7261 unsigned Offset) const {
7262 SDLoc SL(Op);
7263 SDValue Param = lowerKernargMemParameter(
7264 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7265 // The local size values will have the hi 16-bits as zero.
7266 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7267 DAG.getValueType(VT));
7268}
7269
7270static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7271 EVT VT) {
7272 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7273 "non-hsa intrinsic with hsa target",
7274 DL.getDebugLoc());
7275 DAG.getContext()->diagnose(BadIntrin);
7276 return DAG.getUNDEF(VT);
7277}
7278
7280 EVT VT) {
7282 "intrinsic not supported on subtarget",
7283 DL.getDebugLoc());
7284 DAG.getContext()->diagnose(BadIntrin);
7285 return DAG.getUNDEF(VT);
7286}
7287
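// Build a dword vector for MIMG operands: every element is bitcast to f32.
// Up to 12 elements keep their exact count (e.g. three address dwords become
// a v3f32); 13 to 16 elements are padded with undef up to a v16f32. A single
// element is returned unwrapped.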
7288static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7289 ArrayRef<SDValue> Elts) {
7290 assert(!Elts.empty());
7291 MVT Type;
7292 unsigned NumElts = Elts.size();
7293
7294 if (NumElts <= 12) {
7295 Type = MVT::getVectorVT(MVT::f32, NumElts);
7296 } else {
7297 assert(Elts.size() <= 16);
7298 Type = MVT::v16f32;
7299 NumElts = 16;
7300 }
7301
7302 SmallVector<SDValue, 16> VecElts(NumElts);
7303 for (unsigned i = 0; i < Elts.size(); ++i) {
7304 SDValue Elt = Elts[i];
7305 if (Elt.getValueType() != MVT::f32)
7306 Elt = DAG.getBitcast(MVT::f32, Elt);
7307 VecElts[i] = Elt;
7308 }
7309 for (unsigned i = Elts.size(); i < NumElts; ++i)
7310 VecElts[i] = DAG.getUNDEF(MVT::f32);
7311
7312 if (NumElts == 1)
7313 return VecElts[0];
7314 return DAG.getBuildVector(Type, DL, VecElts);
7315}
7316
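// Widen Src to CastVT by appending ExtraElts undef elements, extracting the
// existing elements first when Src is already a vector.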
7317static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7318 SDValue Src, int ExtraElts) {
7319 EVT SrcVT = Src.getValueType();
7320
7321 SmallVector<SDValue, 8> Elts;
7322
7323 if (SrcVT.isVector())
7324 DAG.ExtractVectorElements(Src, Elts);
7325 else
7326 Elts.push_back(Src);
7327
7328 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7329 while (ExtraElts--)
7330 Elts.push_back(Undef);
7331
7332 return DAG.getBuildVector(CastVT, DL, Elts);
7333}
7334
7335// Reconstruct the required return value for an image load intrinsic.
7336// This is more complicated due to the optional use of TexFailCtrl, which means
7337// the required return type is an aggregate.
7338static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7339 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7340 bool Unpacked, bool IsD16, int DMaskPop,
7341 int NumVDataDwords, bool IsAtomicPacked16Bit,
7342 const SDLoc &DL) {
7343 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7344 EVT ReqRetVT = ResultTypes[0];
7345 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7346 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7347 ? (ReqRetNumElts + 1) / 2
7348 : ReqRetNumElts;
7349
7350 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7351 DMaskPop : (DMaskPop + 1) / 2;
7352
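 // For example, a 3-element f16 result with packed D16 needs 2 data dwords,
 // while the same result unpacked needs 3.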
7353 MVT DataDwordVT = NumDataDwords == 1 ?
7354 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7355
7356 MVT MaskPopVT = MaskPopDwords == 1 ?
7357 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7358
7359 SDValue Data(Result, 0);
7360 SDValue TexFail;
7361
7362 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7363 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7364 if (MaskPopVT.isVector()) {
7365 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7366 SDValue(Result, 0), ZeroIdx);
7367 } else {
7368 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7369 SDValue(Result, 0), ZeroIdx);
7370 }
7371 }
7372
7373 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7374 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7375 NumDataDwords - MaskPopDwords);
7376
7377 if (IsD16)
7378 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7379
7380 EVT LegalReqRetVT = ReqRetVT;
7381 if (!ReqRetVT.isVector()) {
7382 if (!Data.getValueType().isInteger())
7383 Data = DAG.getNode(ISD::BITCAST, DL,
7384 Data.getValueType().changeTypeToInteger(), Data);
7385 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7386 } else {
7387 // We need to widen the return vector to a legal type
7388 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7389 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7390 LegalReqRetVT =
7391 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7392 ReqRetVT.getVectorNumElements() + 1);
7393 }
7394 }
7395 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7396
7397 if (IsTexFail) {
7398 TexFail =
7399 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7400 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7401
7402 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7403 }
7404
7405 if (Result->getNumValues() == 1)
7406 return Data;
7407
7408 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7409}
7410
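// Decode the TexFailCtrl immediate: bit 0 enables TFE and bit 1 enables LWE.
// IsTexFail is set if either bit is set; the function returns false (and the
// caller gives up on custom lowering) when any other bit is present.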
7411static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7412 SDValue *LWE, bool &IsTexFail) {
7413 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7414
7415 uint64_t Value = TexFailCtrlConst->getZExtValue();
7416 if (Value) {
7417 IsTexFail = true;
7418 }
7419
7420 SDLoc DL(TexFailCtrlConst);
7421 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7422 Value &= ~(uint64_t)0x1;
7423 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7424 Value &= ~(uint64_t)0x2;
7425
7426 return Value == 0;
7427}
7428
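// Pack 16-bit image address/gradient operands in [DimIdx, EndIdx) into dwords:
// adjacent operands are combined into a packed two-element vector, while an
// operand left without a partner (the last one, or the last of an odd-sized
// gradient group) is any-extended to 32 bits. Each dword is then bitcast to
// f32 and appended to PackedAddrs.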
7429static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7430 MVT PackVectorVT,
7431 SmallVectorImpl<SDValue> &PackedAddrs,
7432 unsigned DimIdx, unsigned EndIdx,
7433 unsigned NumGradients) {
7434 SDLoc DL(Op);
7435 for (unsigned I = DimIdx; I < EndIdx; I++) {
7436 SDValue Addr = Op.getOperand(I);
7437
7438 // Gradients are packed with undef for each coordinate.
7439 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7440 // 1D: undef,dx/dh; undef,dx/dv
7441 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7442 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7443 if (((I + 1) >= EndIdx) ||
7444 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7445 I == DimIdx + NumGradients - 1))) {
7446 if (Addr.getValueType() != MVT::i16)
7447 Addr = DAG.getBitcast(MVT::i16, Addr);
7448 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7449 } else {
7450 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7451 I++;
7452 }
7453 Addr = DAG.getBitcast(MVT::f32, Addr);
7454 PackedAddrs.push_back(Addr);
7455 }
7456}
7457
7458SDValue SITargetLowering::lowerImage(SDValue Op,
7459 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7460 SelectionDAG &DAG, bool WithChain) const {
7461 SDLoc DL(Op);
7462 MachineFunction &MF = DAG.getMachineFunction();
7463 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7464 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7465 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7466 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7467 unsigned IntrOpcode = Intr->BaseOpcode;
7468 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7469 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7470 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7471
7472 SmallVector<EVT, 3> ResultTypes(Op->values());
7473 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7474 bool IsD16 = false;
7475 bool IsG16 = false;
7476 bool IsA16 = false;
7477 SDValue VData;
7478 int NumVDataDwords;
7479 bool AdjustRetType = false;
7480 bool IsAtomicPacked16Bit = false;
7481
7482 // Offset of intrinsic arguments
7483 const unsigned ArgOffset = WithChain ? 2 : 1;
7484
7485 unsigned DMask;
7486 unsigned DMaskLanes = 0;
7487
7488 if (BaseOpcode->Atomic) {
7489 VData = Op.getOperand(2);
7490
7491 IsAtomicPacked16Bit =
7492 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7493 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7494
7495 bool Is64Bit = VData.getValueSizeInBits() == 64;
7496 if (BaseOpcode->AtomicX2) {
7497 SDValue VData2 = Op.getOperand(3);
7498 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7499 {VData, VData2});
7500 if (Is64Bit)
7501 VData = DAG.getBitcast(MVT::v4i32, VData);
7502
7503 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7504 DMask = Is64Bit ? 0xf : 0x3;
7505 NumVDataDwords = Is64Bit ? 4 : 2;
7506 } else {
7507 DMask = Is64Bit ? 0x3 : 0x1;
7508 NumVDataDwords = Is64Bit ? 2 : 1;
7509 }
7510 } else {
7511 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7512 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7513
7514 if (BaseOpcode->Store) {
7515 VData = Op.getOperand(2);
7516
7517 MVT StoreVT = VData.getSimpleValueType();
7518 if (StoreVT.getScalarType() == MVT::f16) {
7519 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7520 return Op; // D16 is unsupported for this instruction
7521
7522 IsD16 = true;
7523 VData = handleD16VData(VData, DAG, true);
7524 }
7525
7526 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7527 } else {
7528 // Work out the number of dwords based on the dmask popcount, the
7529 // underlying type, and whether packing is supported.
7530 MVT LoadVT = ResultTypes[0].getSimpleVT();
7531 if (LoadVT.getScalarType() == MVT::f16) {
7532 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7533 return Op; // D16 is unsupported for this instruction
7534
7535 IsD16 = true;
7536 }
7537
7538 // Confirm that the return type is large enough for the dmask specified
7539 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7540 (!LoadVT.isVector() && DMaskLanes > 1))
7541 return Op;
7542
7543 // The sq block of gfx8 and gfx9 does not estimate register use correctly
7544 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7545 // instructions.
7546 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7547 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7548 NumVDataDwords = (DMaskLanes + 1) / 2;
7549 else
7550 NumVDataDwords = DMaskLanes;
7551
7552 AdjustRetType = true;
7553 }
7554 }
7555
7556 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7557 SmallVector<SDValue, 4> VAddrs;
7558
7559 // Check for 16 bit addresses or derivatives and pack if true.
7560 MVT VAddrVT =
7561 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7562 MVT VAddrScalarVT = VAddrVT.getScalarType();
7563 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7564 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7565
7566 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7567 VAddrScalarVT = VAddrVT.getScalarType();
7568 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7569 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7570
7571 // Push back extra arguments.
7572 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7573 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7574 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7575 // Special handling of bias when A16 is on. Bias is of type half but
7576 // occupies full 32-bit.
7577 SDValue Bias = DAG.getBuildVector(
7578 MVT::v2f16, DL,
7579 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7580 VAddrs.push_back(Bias);
7581 } else {
7582 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7583 "Bias needs to be converted to 16 bit in A16 mode");
7584 VAddrs.push_back(Op.getOperand(ArgOffset + I));
7585 }
7586 }
7587
7588 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
7589 // 16 bit gradients are supported, but are tied to the A16 control
7590 // so both gradients and addresses must be 16 bit
7591 LLVM_DEBUG(
7592 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7593 "require 16 bit args for both gradients and addresses");
7594 return Op;
7595 }
7596
7597 if (IsA16) {
7598 if (!ST->hasA16()) {
7599 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7600 "support 16 bit addresses\n");
7601 return Op;
7602 }
7603 }
7604
7605 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
7606 // is set then we have to compress/pack operands (either addresses,
7607 // gradients, or both).
7608 // In the case where a16 and gradients are tied (no G16 support), we have
7609 // already verified that both IsA16 and IsG16 are true.
7610 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7611 // Activate g16
7612 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
7613 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
7614 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7615 }
7616
7617 // Add gradients (packed or unpacked)
7618 if (IsG16) {
7619 // Pack the gradients
7620 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7621 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
7622 ArgOffset + Intr->GradientStart,
7623 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7624 } else {
7625 for (unsigned I = ArgOffset + Intr->GradientStart;
7626 I < ArgOffset + Intr->CoordStart; I++)
7627 VAddrs.push_back(Op.getOperand(I));
7628 }
7629
7630 // Add addresses (packed or unpacked)
7631 if (IsA16) {
7632 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
7633 ArgOffset + Intr->CoordStart, VAddrEnd,
7634 0 /* No gradients */);
7635 } else {
7636 // Add uncompressed address
7637 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7638 VAddrs.push_back(Op.getOperand(I));
7639 }
7640
7641 // If the register allocator cannot place the address registers contiguously
7642 // without introducing moves, then using the non-sequential address encoding
7643 // is always preferable, since it saves VALU instructions and is usually a
7644 // wash in terms of code size or even better.
7645 //
7646 // However, we currently have no way of hinting to the register allocator that
7647 // MIMG addresses should be placed contiguously when it is possible to do so,
7648 // so force non-NSA for the common 2-address case as a heuristic.
7649 //
7650 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7651 // allocation when possible.
7652 //
7653 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7654 // set of the remaining addresses.
7655 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7656 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7657 const bool UseNSA = ST->hasNSAEncoding() &&
7658 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7659 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7660 const bool UsePartialNSA =
7661 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
7662
7663 SDValue VAddr;
7664 if (UsePartialNSA) {
7665 VAddr = getBuildDwordsVector(DAG, DL,
7666 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7667 }
7668 else if (!UseNSA) {
7669 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
7670 }
7671
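 // With partial NSA only the first NSAMaxSize - 1 addresses remain separate
 // operands; the remaining addresses are merged into the single contiguous
 // vector register built above.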
7672 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7673 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7674 SDValue Unorm;
7675 if (!BaseOpcode->Sampler) {
7676 Unorm = True;
7677 } else {
7678 auto UnormConst =
7679 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex));
7680
7681 Unorm = UnormConst->getZExtValue() ? True : False;
7682 }
7683
7684 SDValue TFE;
7685 SDValue LWE;
7686 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7687 bool IsTexFail = false;
7688 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7689 return Op;
7690
7691 if (IsTexFail) {
7692 if (!DMaskLanes) {
7693 // Expecting to get an error flag since TFC is on and dmask is 0.
7694 // Force dmask to be at least 1, otherwise the instruction will fail.
7695 DMask = 0x1;
7696 DMaskLanes = 1;
7697 NumVDataDwords = 1;
7698 }
7699 NumVDataDwords += 1;
7700 AdjustRetType = true;
7701 }
7702
7703 // Something earlier may have tagged the return type as needing adjustment.
7704 // This happens if the instruction is a load or has set TexFailCtrl flags.
7705 if (AdjustRetType) {
7706 // NumVDataDwords reflects the true number of dwords required in the return type
7707 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7708 // This is a no-op load. This can be eliminated
7709 SDValue Undef = DAG.getUNDEF(Op.getValueType());
7710 if (isa<MemSDNode>(Op))
7711 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
7712 return Undef;
7713 }
7714
7715 EVT NewVT = NumVDataDwords > 1 ?
7716 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7717 : MVT::i32;
7718
7719 ResultTypes[0] = NewVT;
7720 if (ResultTypes.size() == 3) {
7721 // Original result was aggregate type used for TexFailCtrl results
7722 // The actual instruction returns as a vector type which has now been
7723 // created. Remove the aggregate result.
7724 ResultTypes.erase(&ResultTypes[1]);
7725 }
7726 }
7727
7728 unsigned CPol = cast<ConstantSDNode>(
7729 Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
7730 if (BaseOpcode->Atomic)
7731 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7732 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
7733 AMDGPU::CPol::VOLATILE))
7734 return Op;
7735
7737 if (BaseOpcode->Store || BaseOpcode->Atomic)
7738 Ops.push_back(VData); // vdata
7739 if (UsePartialNSA) {
7740 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
7741 Ops.push_back(VAddr);
7742 }
7743 else if (UseNSA)
7744 append_range(Ops, VAddrs);
7745 else
7746 Ops.push_back(VAddr);
7747 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
7748 if (BaseOpcode->Sampler)
7749 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
7750 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7751 if (IsGFX10Plus)
7752 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7753 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7754 Ops.push_back(Unorm);
7755 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7756 Ops.push_back(IsA16 && // r128, a16 for gfx9
7757 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7758 if (IsGFX10Plus)
7759 Ops.push_back(IsA16 ? True : False);
7760 if (!Subtarget->hasGFX90AInsts()) {
7761 Ops.push_back(TFE); //tfe
7762 } else if (TFE->getAsZExtVal()) {
7763 report_fatal_error("TFE is not supported on this GPU");
7764 }
7765 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7766 Ops.push_back(LWE); // lwe
7767 if (!IsGFX10Plus)
7768 Ops.push_back(DimInfo->DA ? True : False);
7769 if (BaseOpcode->HasD16)
7770 Ops.push_back(IsD16 ? True : False);
7771 if (isa<MemSDNode>(Op))
7772 Ops.push_back(Op.getOperand(0)); // chain
7773
7774 int NumVAddrDwords =
7775 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
7776 int Opcode = -1;
7777
7778 if (IsGFX12Plus) {
7779 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
7780 NumVDataDwords, NumVAddrDwords);
7781 } else if (IsGFX11Plus) {
7782 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7783 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7784 : AMDGPU::MIMGEncGfx11Default,
7785 NumVDataDwords, NumVAddrDwords);
7786 } else if (IsGFX10Plus) {
7787 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7788 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7789 : AMDGPU::MIMGEncGfx10Default,
7790 NumVDataDwords, NumVAddrDwords);
7791 } else {
7792 if (Subtarget->hasGFX90AInsts()) {
7793 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
7794 NumVDataDwords, NumVAddrDwords);
7795 if (Opcode == -1)
7797 "requested image instruction is not supported on this GPU");
7798 }
7799 if (Opcode == -1 &&
7800 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7801 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
7802 NumVDataDwords, NumVAddrDwords);
7803 if (Opcode == -1)
7804 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
7805 NumVDataDwords, NumVAddrDwords);
7806 }
7807 if (Opcode == -1)
7808 return Op;
7809
7810 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
7811 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
7812 MachineMemOperand *MemRef = MemOp->getMemOperand();
7813 DAG.setNodeMemRefs(NewNode, {MemRef});
7814 }
7815
7816 if (BaseOpcode->AtomicX2) {
7817 SmallVector<SDValue, 1> Elt;
7818 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
7819 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
7820 }
7821 if (BaseOpcode->Store)
7822 return SDValue(NewNode, 0);
7823 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
7824 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
7825 NumVDataDwords, IsAtomicPacked16Bit, DL);
7826}
7827
7828SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
7829 SDValue Offset, SDValue CachePolicy,
7830 SelectionDAG &DAG) const {
7831 MachineFunction &MF = DAG.getMachineFunction();
7832
7833 const DataLayout &DataLayout = DAG.getDataLayout();
7834 Align Alignment =
7835 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7836
7837 MachineMemOperand *MMO = MF.getMachineMemOperand(
7838 MachinePointerInfo(),
7839 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7840 MachineMemOperand::MOInvariant,
7841 VT.getStoreSize(), Alignment);
7842
7843 if (!Offset->isDivergent()) {
7844 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
7845
7846 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
7847 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
7848 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
7849 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
7850 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
7851 SDValue BufferLoad =
7853 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7854 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7855 }
7856
7857 // Widen vec3 load to vec4.
7858 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
7859 !Subtarget->hasScalarDwordx3Loads()) {
7860 EVT WidenedVT =
7862 auto WidenedOp = DAG.getMemIntrinsicNode(
7863 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
7864 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
7865 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
7866 DAG.getVectorIdxConstant(0, DL));
7867 return Subvector;
7868 }
7869
7870 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
7871 DAG.getVTList(VT), Ops, VT, MMO);
7872 }
7873
7874 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
7875 // assume that the buffer is unswizzled.
7876 SDValue Ops[] = {
7877 DAG.getEntryNode(), // Chain
7878 Rsrc, // rsrc
7879 DAG.getConstant(0, DL, MVT::i32), // vindex
7880 {}, // voffset
7881 {}, // soffset
7882 {}, // offset
7883 CachePolicy, // cachepolicy
7884 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7885 };
7886 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
7887 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7888 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7889 }
7890
7891 SmallVector<SDValue, 4> Loads;
7892 unsigned NumLoads = 1;
7893 MVT LoadVT = VT.getSimpleVT();
7894 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
7895 assert((LoadVT.getScalarType() == MVT::i32 ||
7896 LoadVT.getScalarType() == MVT::f32));
7897
7898 if (NumElts == 8 || NumElts == 16) {
7899 NumLoads = NumElts / 4;
7900 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
7901 }
7902
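 // For example (illustrative): an <8 x i32> s.buffer.load with a divergent
 // offset is emitted as two 16-byte buffer loads at offsets +0 and +16 and
 // then concatenated back into the original vector type below.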
7903 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
7904
7905 // Use the alignment to ensure that the required offsets will fit into the
7906 // immediate offsets.
7907 setBufferOffsets(Offset, DAG, &Ops[3],
7908 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
7909
7910 uint64_t InstOffset = Ops[5]->getAsZExtVal();
7911 for (unsigned i = 0; i < NumLoads; ++i) {
7912 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
7913 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
7914 LoadVT, MMO, DAG));
7915 }
7916
7917 if (NumElts == 8 || NumElts == 16)
7918 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
7919
7920 return Loads[0];
7921}
7922
7923SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
7924 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7925 if (!Subtarget->hasArchitectedSGPRs())
7926 return {};
7927 SDLoc SL(Op);
7928 MVT VT = MVT::i32;
7929 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
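 // Extract bits [29:25], i.e. (TTMP8 >> 25) & 0x1f.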
7930 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
7931 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
7932}
7933
7934SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
7935 unsigned Dim,
7936 const ArgDescriptor &Arg) const {
7937 SDLoc SL(Op);
7939 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
7940 if (MaxID == 0)
7941 return DAG.getConstant(0, SL, MVT::i32);
7942
7943 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
7944 SDLoc(DAG.getEntryNode()), Arg);
7945
7946 // Don't bother inserting AssertZext for packed IDs since we're emitting the
7947 // masking operations anyway.
7948 //
7949 // TODO: We could assert the top bit is 0 for the source copy.
7950 if (Arg.isMasked())
7951 return Val;
7952
7953 // Preserve the known bits after expansion to a copy.
7955 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
7956 DAG.getValueType(SmallVT));
7957}
7958
7959SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
7960 SelectionDAG &DAG) const {
7962 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
7963
7964 EVT VT = Op.getValueType();
7965 SDLoc DL(Op);
7966 unsigned IntrinsicID = Op.getConstantOperandVal(0);
7967
7968 // TODO: Should this propagate fast-math-flags?
7969
7970 switch (IntrinsicID) {
7971 case Intrinsic::amdgcn_implicit_buffer_ptr: {
7972 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
7973 return emitNonHSAIntrinsicError(DAG, DL, VT);
7974 return getPreloadedValue(DAG, *MFI, VT,
7975 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7976 }
7977 case Intrinsic::amdgcn_dispatch_ptr:
7978 case Intrinsic::amdgcn_queue_ptr: {
7979 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
7980 DiagnosticInfoUnsupported BadIntrin(
7981 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
7982 DL.getDebugLoc());
7983 DAG.getContext()->diagnose(BadIntrin);
7984 return DAG.getUNDEF(VT);
7985 }
7986
7987 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
7988 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
7989 return getPreloadedValue(DAG, *MFI, VT, RegID);
7990 }
7991 case Intrinsic::amdgcn_implicitarg_ptr: {
7992 if (MFI->isEntryFunction())
7993 return getImplicitArgPtr(DAG, DL);
7994 return getPreloadedValue(DAG, *MFI, VT,
7995 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
7996 }
7997 case Intrinsic::amdgcn_kernarg_segment_ptr: {
7998 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
7999 // This only makes sense to call in a kernel, so just lower to null.
8000 return DAG.getConstant(0, DL, VT);
8001 }
8002
8003 return getPreloadedValue(DAG, *MFI, VT,
8004 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8005 }
8006 case Intrinsic::amdgcn_dispatch_id: {
8007 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8008 }
8009 case Intrinsic::amdgcn_rcp:
8010 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8011 case Intrinsic::amdgcn_rsq:
8012 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8013 case Intrinsic::amdgcn_rsq_legacy:
8015 return emitRemovedIntrinsicError(DAG, DL, VT);
8016 return SDValue();
8017 case Intrinsic::amdgcn_rcp_legacy:
8019 return emitRemovedIntrinsicError(DAG, DL, VT);
8020 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8021 case Intrinsic::amdgcn_rsq_clamp: {
8022 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8023 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8024
8025 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8026 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8027 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8028
8029 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8030 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8031 DAG.getConstantFP(Max, DL, VT));
8032 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8033 DAG.getConstantFP(Min, DL, VT));
8034 }
8035 case Intrinsic::r600_read_ngroups_x:
8036 if (Subtarget->isAmdHsaOS())
8037 return emitNonHSAIntrinsicError(DAG, DL, VT);
8038
8039 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8041 false);
8042 case Intrinsic::r600_read_ngroups_y:
8043 if (Subtarget->isAmdHsaOS())
8044 return emitNonHSAIntrinsicError(DAG, DL, VT);
8045
8046 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8048 false);
8049 case Intrinsic::r600_read_ngroups_z:
8050 if (Subtarget->isAmdHsaOS())
8051 return emitNonHSAIntrinsicError(DAG, DL, VT);
8052
8053 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8055 false);
8056 case Intrinsic::r600_read_global_size_x:
8057 if (Subtarget->isAmdHsaOS())
8058 return emitNonHSAIntrinsicError(DAG, DL, VT);
8059
8060 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8062 Align(4), false);
8063 case Intrinsic::r600_read_global_size_y:
8064 if (Subtarget->isAmdHsaOS())
8065 return emitNonHSAIntrinsicError(DAG, DL, VT);
8066
8067 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8069 Align(4), false);
8070 case Intrinsic::r600_read_global_size_z:
8071 if (Subtarget->isAmdHsaOS())
8072 return emitNonHSAIntrinsicError(DAG, DL, VT);
8073
8074 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8076 Align(4), false);
8077 case Intrinsic::r600_read_local_size_x:
8078 if (Subtarget->isAmdHsaOS())
8079 return emitNonHSAIntrinsicError(DAG, DL, VT);
8080
8081 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8083 case Intrinsic::r600_read_local_size_y:
8084 if (Subtarget->isAmdHsaOS())
8085 return emitNonHSAIntrinsicError(DAG, DL, VT);
8086
8087 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8089 case Intrinsic::r600_read_local_size_z:
8090 if (Subtarget->isAmdHsaOS())
8091 return emitNonHSAIntrinsicError(DAG, DL, VT);
8092
8093 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8095 case Intrinsic::amdgcn_workgroup_id_x:
8096 return getPreloadedValue(DAG, *MFI, VT,
8097 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8098 case Intrinsic::amdgcn_workgroup_id_y:
8099 return getPreloadedValue(DAG, *MFI, VT,
8100 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8101 case Intrinsic::amdgcn_workgroup_id_z:
8102 return getPreloadedValue(DAG, *MFI, VT,
8103 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8104 case Intrinsic::amdgcn_wave_id:
8105 return lowerWaveID(DAG, Op);
8106 case Intrinsic::amdgcn_lds_kernel_id: {
8107 if (MFI->isEntryFunction())
8108 return getLDSKernelId(DAG, DL);
8109 return getPreloadedValue(DAG, *MFI, VT,
8110 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8111 }
8112 case Intrinsic::amdgcn_workitem_id_x:
8113 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8114 case Intrinsic::amdgcn_workitem_id_y:
8115 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8116 case Intrinsic::amdgcn_workitem_id_z:
8117 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8118 case Intrinsic::amdgcn_wavefrontsize:
8120 SDLoc(Op), MVT::i32);
8121 case Intrinsic::amdgcn_s_buffer_load: {
8122 unsigned CPol = Op.getConstantOperandVal(3);
8123 // s_buffer_load, because of how it's optimized, can't be volatile
8124 // so reject ones with the volatile bit set.
8125 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8128 return Op;
8129 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8130 DAG);
8131 }
8132 case Intrinsic::amdgcn_fdiv_fast:
8133 return lowerFDIV_FAST(Op, DAG);
8134 case Intrinsic::amdgcn_sin:
8135 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8136
8137 case Intrinsic::amdgcn_cos:
8138 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8139
8140 case Intrinsic::amdgcn_mul_u24:
8141 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8142 case Intrinsic::amdgcn_mul_i24:
8143 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8144
8145 case Intrinsic::amdgcn_log_clamp: {
8147 return SDValue();
8148
8149 return emitRemovedIntrinsicError(DAG, DL, VT);
8150 }
8151 case Intrinsic::amdgcn_fract:
8152 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8153
8154 case Intrinsic::amdgcn_class:
8155 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8156 Op.getOperand(1), Op.getOperand(2));
8157 case Intrinsic::amdgcn_div_fmas:
8158 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8159 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8160 Op.getOperand(4));
8161
8162 case Intrinsic::amdgcn_div_fixup:
8163 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8164 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8165
8166 case Intrinsic::amdgcn_div_scale: {
8167 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8168
8169 // Translate to the operands expected by the machine instruction. The
8170 // first parameter must be the same as the first instruction.
8171 SDValue Numerator = Op.getOperand(1);
8172 SDValue Denominator = Op.getOperand(2);
8173
8174 // Note this order is opposite of the machine instruction's operations,
8175 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8176 // intrinsic has the numerator as the first operand to match a normal
8177 // division operation.
8178
8179 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8180
8181 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8182 Denominator, Numerator);
8183 }
8184 case Intrinsic::amdgcn_icmp: {
8185 // There is a Pat that handles this variant, so return it as-is.
8186 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8187 Op.getConstantOperandVal(2) == 0 &&
8188 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8189 return Op;
8190 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8191 }
8192 case Intrinsic::amdgcn_fcmp: {
8193 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8194 }
8195 case Intrinsic::amdgcn_ballot:
8196 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8197 case Intrinsic::amdgcn_fmed3:
8198 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8199 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8200 case Intrinsic::amdgcn_fdot2:
8201 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8202 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8203 Op.getOperand(4));
8204 case Intrinsic::amdgcn_fmul_legacy:
8205 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8206 Op.getOperand(1), Op.getOperand(2));
8207 case Intrinsic::amdgcn_sffbh:
8208 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8209 case Intrinsic::amdgcn_sbfe:
8210 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8211 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8212 case Intrinsic::amdgcn_ubfe:
8213 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8214 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8215 case Intrinsic::amdgcn_cvt_pkrtz:
8216 case Intrinsic::amdgcn_cvt_pknorm_i16:
8217 case Intrinsic::amdgcn_cvt_pknorm_u16:
8218 case Intrinsic::amdgcn_cvt_pk_i16:
8219 case Intrinsic::amdgcn_cvt_pk_u16: {
8220 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8221 EVT VT = Op.getValueType();
8222 unsigned Opcode;
8223
8224 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8226 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8228 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8230 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8232 else
8234
8235 if (isTypeLegal(VT))
8236 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8237
8238 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8239 Op.getOperand(1), Op.getOperand(2));
8240 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8241 }
8242 case Intrinsic::amdgcn_fmad_ftz:
8243 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8244 Op.getOperand(2), Op.getOperand(3));
8245
8246 case Intrinsic::amdgcn_if_break:
8247 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8248 Op->getOperand(1), Op->getOperand(2)), 0);
8249
8250 case Intrinsic::amdgcn_groupstaticsize: {
8252 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8253 return Op;
8254
8255 const Module *M = MF.getFunction().getParent();
8256 const GlobalValue *GV =
8257 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8258 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8260 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8261 }
8262 case Intrinsic::amdgcn_is_shared:
8263 case Intrinsic::amdgcn_is_private: {
8264 SDLoc SL(Op);
8265 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8266 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8267 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
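 // The flat pointer is in the requested address space iff the high 32 bits
 // of the 64-bit pointer match the corresponding aperture base.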
8268 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8269 Op.getOperand(1));
8270
8271 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8272 DAG.getConstant(1, SL, MVT::i32));
8273 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8274 }
8275 case Intrinsic::amdgcn_perm:
8276 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8277 Op.getOperand(2), Op.getOperand(3));
8278 case Intrinsic::amdgcn_reloc_constant: {
8279 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8280 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8281 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8282 auto RelocSymbol = cast<GlobalVariable>(
8283 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8284 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8286 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8287 }
8288 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8289 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8290 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8291 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8292 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8293 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8294 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8295 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8296 if (Op.getOperand(4).getValueType() == MVT::i32)
8297 return SDValue();
8298
8299 SDLoc SL(Op);
8300 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8301 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8302 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8303 Op.getOperand(3), IndexKeyi32);
8304 }
8305 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8306 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8307 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8308 if (Op.getOperand(6).getValueType() == MVT::i32)
8309 return SDValue();
8310
8311 SDLoc SL(Op);
8312 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8313 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8314 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8315 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8316 IndexKeyi32, Op.getOperand(7)});
8317 }
8318 default:
8319 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8321 return lowerImage(Op, ImageDimIntr, DAG, false);
8322
8323 return Op;
8324 }
8325}
8326
8327// On targets that do not support a constant in the soffset field, turn a zero
8328// into SGPR_NULL to avoid generating an extra s_mov with zero.
8329static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8330 const GCNSubtarget *Subtarget) {
8331 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8332 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8333 return SOffset;
8334}
8335
8336SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8337 SelectionDAG &DAG,
8338 unsigned NewOpcode) const {
8339 SDLoc DL(Op);
8340
8341 SDValue VData = Op.getOperand(2);
8342 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8343 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8344 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8345 SDValue Ops[] = {
8346 Op.getOperand(0), // Chain
8347 VData, // vdata
8348 Rsrc, // rsrc
8349 DAG.getConstant(0, DL, MVT::i32), // vindex
8350 Offsets.first, // voffset
8351 SOffset, // soffset
8352 Offsets.second, // offset
8353 Op.getOperand(6), // cachepolicy
8354 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8355 };
8356
8357 auto *M = cast<MemSDNode>(Op);
8358
8359 EVT MemVT = VData.getValueType();
8360 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8361 M->getMemOperand());
8362}
8363
8364// Return a value to use for the idxen operand by examining the vindex operand.
8365static unsigned getIdxEn(SDValue VIndex) {
8366 // No need to set idxen if vindex is known to be zero.
8367 return isNullConstant(VIndex) ? 0 : 1;
8368}
8369
8370SDValue
8371SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8372 unsigned NewOpcode) const {
8373 SDLoc DL(Op);
8374
8375 SDValue VData = Op.getOperand(2);
8376 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8377 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8378 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8379 SDValue Ops[] = {
8380 Op.getOperand(0), // Chain
8381 VData, // vdata
8382 Rsrc, // rsrc
8383 Op.getOperand(4), // vindex
8384 Offsets.first, // voffset
8385 SOffset, // soffset
8386 Offsets.second, // offset
8387 Op.getOperand(7), // cachepolicy
8388 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8389 };
8390
8391 auto *M = cast<MemSDNode>(Op);
8392
8393 EVT MemVT = VData.getValueType();
8394 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8395 M->getMemOperand());
8396}
8397
8398SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8399 SelectionDAG &DAG) const {
8400 unsigned IntrID = Op.getConstantOperandVal(1);
8401 SDLoc DL(Op);
8402
8403 switch (IntrID) {
8404 case Intrinsic::amdgcn_ds_ordered_add:
8405 case Intrinsic::amdgcn_ds_ordered_swap: {
8406 MemSDNode *M = cast<MemSDNode>(Op);
8407 SDValue Chain = M->getOperand(0);
8408 SDValue M0 = M->getOperand(2);
8409 SDValue Value = M->getOperand(3);
8410 unsigned IndexOperand = M->getConstantOperandVal(7);
8411 unsigned WaveRelease = M->getConstantOperandVal(8);
8412 unsigned WaveDone = M->getConstantOperandVal(9);
8413
8414 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8415 IndexOperand &= ~0x3f;
8416 unsigned CountDw = 0;
8417
8418 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8419 CountDw = (IndexOperand >> 24) & 0xf;
8420 IndexOperand &= ~(0xf << 24);
8421
8422 if (CountDw < 1 || CountDw > 4) {
8424 "ds_ordered_count: dword count must be between 1 and 4");
8425 }
8426 }
8427
8428 if (IndexOperand)
8429 report_fatal_error("ds_ordered_count: bad index operand");
8430
8431 if (WaveDone && !WaveRelease)
8432 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8433
8434 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8435 unsigned ShaderType =
8436 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8437 unsigned Offset0 = OrderedCountIndex << 2;
8438 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8439
8440 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8441 Offset1 |= (CountDw - 1) << 6;
8442
8443 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8444 Offset1 |= ShaderType << 2;
8445
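 // The final 16-bit immediate packs the byte offset of the ordered counter
 // (OrderedCountIndex * 4) into bits [7:0] and the control bits assembled in
 // Offset1 into bits [15:8].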
8446 unsigned Offset = Offset0 | (Offset1 << 8);
8447
8448 SDValue Ops[] = {
8449 Chain,
8450 Value,
8451 DAG.getTargetConstant(Offset, DL, MVT::i16),
8452 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8453 };
8454 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8455 M->getVTList(), Ops, M->getMemoryVT(),
8456 M->getMemOperand());
8457 }
8458 case Intrinsic::amdgcn_ds_fadd: {
8459 MemSDNode *M = cast<MemSDNode>(Op);
8460 unsigned Opc;
8461 switch (IntrID) {
8462 case Intrinsic::amdgcn_ds_fadd:
8463 Opc = ISD::ATOMIC_LOAD_FADD;
8464 break;
8465 }
8466
8467 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
8468 M->getOperand(0), M->getOperand(2), M->getOperand(3),
8469 M->getMemOperand());
8470 }
8471 case Intrinsic::amdgcn_ds_fmin:
8472 case Intrinsic::amdgcn_ds_fmax: {
8473 MemSDNode *M = cast<MemSDNode>(Op);
8474 unsigned Opc;
8475 switch (IntrID) {
8476 case Intrinsic::amdgcn_ds_fmin:
8477 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8478 break;
8479 case Intrinsic::amdgcn_ds_fmax:
8480 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8481 break;
8482 default:
8483 llvm_unreachable("Unknown intrinsic!");
8484 }
8485 SDValue Ops[] = {
8486 M->getOperand(0), // Chain
8487 M->getOperand(2), // Ptr
8488 M->getOperand(3) // Value
8489 };
8490
8491 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8492 M->getMemoryVT(), M->getMemOperand());
8493 }
8494 case Intrinsic::amdgcn_buffer_load:
8495 case Intrinsic::amdgcn_buffer_load_format: {
8496 unsigned Glc = Op.getConstantOperandVal(5);
8497 unsigned Slc = Op.getConstantOperandVal(6);
8498 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8499 SDValue Ops[] = {
8500 Op.getOperand(0), // Chain
8501 Op.getOperand(2), // rsrc
8502 Op.getOperand(3), // vindex
8503 SDValue(), // voffset -- will be set by setBufferOffsets
8504 SDValue(), // soffset -- will be set by setBufferOffsets
8505 SDValue(), // offset -- will be set by setBufferOffsets
8506 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8507 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8508 };
8509 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
8510
8511 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8512 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8513
8514 EVT VT = Op.getValueType();
8515 EVT IntVT = VT.changeTypeToInteger();
8516 auto *M = cast<MemSDNode>(Op);
8517 EVT LoadVT = Op.getValueType();
8518
8519 if (LoadVT.getScalarType() == MVT::f16)
8520 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8521 M, DAG, Ops);
8522
8523 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8524 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8525 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8526 M->getMemOperand());
8527
8528 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8529 M->getMemOperand(), DAG);
8530 }
8531 case Intrinsic::amdgcn_raw_buffer_load:
8532 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8533 case Intrinsic::amdgcn_raw_buffer_load_format:
8534 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8535 const bool IsFormat =
8536 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8537 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8538
8539 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8540 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8541 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8542 SDValue Ops[] = {
8543 Op.getOperand(0), // Chain
8544 Rsrc, // rsrc
8545 DAG.getConstant(0, DL, MVT::i32), // vindex
8546 Offsets.first, // voffset
8547 SOffset, // soffset
8548 Offsets.second, // offset
8549 Op.getOperand(5), // cachepolicy, swizzled buffer
8550 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8551 };
8552
8553 auto *M = cast<MemSDNode>(Op);
8554 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8555 }
8556 case Intrinsic::amdgcn_struct_buffer_load:
8557 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8558 case Intrinsic::amdgcn_struct_buffer_load_format:
8559 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8560 const bool IsFormat =
8561 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8562 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8563
8564 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8565 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8566 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8567 SDValue Ops[] = {
8568 Op.getOperand(0), // Chain
8569 Rsrc, // rsrc
8570 Op.getOperand(3), // vindex
8571 Offsets.first, // voffset
8572 SOffset, // soffset
8573 Offsets.second, // offset
8574 Op.getOperand(6), // cachepolicy, swizzled buffer
8575 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8576 };
8577
8578 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8579 }
8580 case Intrinsic::amdgcn_tbuffer_load: {
8581 MemSDNode *M = cast<MemSDNode>(Op);
8582 EVT LoadVT = Op.getValueType();
8583
8584 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8585 unsigned Dfmt = Op.getConstantOperandVal(7);
8586 unsigned Nfmt = Op.getConstantOperandVal(8);
8587 unsigned Glc = Op.getConstantOperandVal(9);
8588 unsigned Slc = Op.getConstantOperandVal(10);
8589 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8590 SDValue Ops[] = {
8591 Op.getOperand(0), // Chain
8592 Op.getOperand(2), // rsrc
8593 Op.getOperand(3), // vindex
8594 Op.getOperand(4), // voffset
8595 SOffset, // soffset
8596 Op.getOperand(6), // offset
8597 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8598 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8599 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8600 };
8601
8602 if (LoadVT.getScalarType() == MVT::f16)
8603 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8604 M, DAG, Ops);
8605 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8606 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8607 DAG);
8608 }
8609 case Intrinsic::amdgcn_raw_tbuffer_load:
8610 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8611 MemSDNode *M = cast<MemSDNode>(Op);
8612 EVT LoadVT = Op.getValueType();
8613 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8614 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8615 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8616
8617 SDValue Ops[] = {
8618 Op.getOperand(0), // Chain
8619 Rsrc, // rsrc
8620 DAG.getConstant(0, DL, MVT::i32), // vindex
8621 Offsets.first, // voffset
8622 SOffset, // soffset
8623 Offsets.second, // offset
8624 Op.getOperand(5), // format
8625 Op.getOperand(6), // cachepolicy, swizzled buffer
8626 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8627 };
8628
8629 if (LoadVT.getScalarType() == MVT::f16)
8630 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8631 M, DAG, Ops);
8632 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8633 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8634 DAG);
8635 }
8636 case Intrinsic::amdgcn_struct_tbuffer_load:
8637 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8638 MemSDNode *M = cast<MemSDNode>(Op);
8639 EVT LoadVT = Op.getValueType();
8640 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8641 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8642 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8643
8644 SDValue Ops[] = {
8645 Op.getOperand(0), // Chain
8646 Rsrc, // rsrc
8647 Op.getOperand(3), // vindex
8648 Offsets.first, // voffset
8649 SOffset, // soffset
8650 Offsets.second, // offset
8651 Op.getOperand(6), // format
8652 Op.getOperand(7), // cachepolicy, swizzled buffer
8653 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8654 };
8655
8656 if (LoadVT.getScalarType() == MVT::f16)
8657 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8658 M, DAG, Ops);
8659 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8660 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8661 DAG);
8662 }
8663 case Intrinsic::amdgcn_buffer_atomic_swap:
8664 case Intrinsic::amdgcn_buffer_atomic_add:
8665 case Intrinsic::amdgcn_buffer_atomic_sub:
8666 case Intrinsic::amdgcn_buffer_atomic_csub:
8667 case Intrinsic::amdgcn_buffer_atomic_smin:
8668 case Intrinsic::amdgcn_buffer_atomic_umin:
8669 case Intrinsic::amdgcn_buffer_atomic_smax:
8670 case Intrinsic::amdgcn_buffer_atomic_umax:
8671 case Intrinsic::amdgcn_buffer_atomic_and:
8672 case Intrinsic::amdgcn_buffer_atomic_or:
8673 case Intrinsic::amdgcn_buffer_atomic_xor:
8674 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8675 unsigned Slc = Op.getConstantOperandVal(6);
8676 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8677 SDValue Ops[] = {
8678 Op.getOperand(0), // Chain
8679 Op.getOperand(2), // vdata
8680 Op.getOperand(3), // rsrc
8681 Op.getOperand(4), // vindex
8682 SDValue(), // voffset -- will be set by setBufferOffsets
8683 SDValue(), // soffset -- will be set by setBufferOffsets
8684 SDValue(), // offset -- will be set by setBufferOffsets
8685 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8686 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8687 };
8688 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8689
8690 EVT VT = Op.getValueType();
8691
8692 auto *M = cast<MemSDNode>(Op);
8693 unsigned Opcode = 0;
8694
8695 switch (IntrID) {
8696 case Intrinsic::amdgcn_buffer_atomic_swap:
8697 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8698 break;
8699 case Intrinsic::amdgcn_buffer_atomic_add:
8700 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8701 break;
8702 case Intrinsic::amdgcn_buffer_atomic_sub:
8703 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8704 break;
8705 case Intrinsic::amdgcn_buffer_atomic_csub:
8706 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8707 break;
8708 case Intrinsic::amdgcn_buffer_atomic_smin:
8709 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8710 break;
8711 case Intrinsic::amdgcn_buffer_atomic_umin:
8712 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8713 break;
8714 case Intrinsic::amdgcn_buffer_atomic_smax:
8715 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8716 break;
8717 case Intrinsic::amdgcn_buffer_atomic_umax:
8718 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8719 break;
8720 case Intrinsic::amdgcn_buffer_atomic_and:
8721 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8722 break;
8723 case Intrinsic::amdgcn_buffer_atomic_or:
8724 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8725 break;
8726 case Intrinsic::amdgcn_buffer_atomic_xor:
8727 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8728 break;
8729 case Intrinsic::amdgcn_buffer_atomic_fadd:
8730 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8731 break;
8732 default:
8733 llvm_unreachable("unhandled atomic opcode");
8734 }
8735
8736 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8737 M->getMemOperand());
8738 }
8739 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8740 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8741 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8742 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8743 return lowerRawBufferAtomicIntrin(Op, DAG,
8744 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8745 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8746 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8747 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8748 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8749 return lowerStructBufferAtomicIntrin(Op, DAG,
8750 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8751 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8752 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8753 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8754 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8755 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8756 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8757 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8758 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8759 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8760 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8761 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8762 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8763 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8764 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8765 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
8766 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8767 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8768 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8769 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8770 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8771 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8772 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8773 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8774 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
8775 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8776 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8777 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
8778 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8779 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8780 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
8781 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8782 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8783 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
8784 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8785 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8786 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8787 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8788 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8789 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8790 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8791 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8792 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8793 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8794 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8795 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8796 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8797 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8798 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8799 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8800 return lowerRawBufferAtomicIntrin(Op, DAG,
8801 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8802 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8803 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8804 return lowerStructBufferAtomicIntrin(Op, DAG,
8805 AMDGPUISD::BUFFER_ATOMIC_SWAP);
8806 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8807 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8808 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8809 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8810 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8811 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8812 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8813 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8814 return lowerStructBufferAtomicIntrin(Op, DAG,
8815 AMDGPUISD::BUFFER_ATOMIC_SMIN);
8816 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8817 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8818 return lowerStructBufferAtomicIntrin(Op, DAG,
8819 AMDGPUISD::BUFFER_ATOMIC_UMIN);
8820 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8821 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8822 return lowerStructBufferAtomicIntrin(Op, DAG,
8823 AMDGPUISD::BUFFER_ATOMIC_SMAX);
8824 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8825 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8826 return lowerStructBufferAtomicIntrin(Op, DAG,
8827 AMDGPUISD::BUFFER_ATOMIC_UMAX);
8828 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8829 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8830 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8831 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8832 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8833 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8834 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8835 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8836 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8837 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8838 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8839 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8840 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8841 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8842 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8843 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8844 return lowerStructBufferAtomicIntrin(Op, DAG,
8845 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8846
8847 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
8848 unsigned Slc = Op.getConstantOperandVal(7);
8849 unsigned IdxEn = getIdxEn(Op.getOperand(5));
8850 SDValue Ops[] = {
8851 Op.getOperand(0), // Chain
8852 Op.getOperand(2), // src
8853 Op.getOperand(3), // cmp
8854 Op.getOperand(4), // rsrc
8855 Op.getOperand(5), // vindex
8856 SDValue(), // voffset -- will be set by setBufferOffsets
8857 SDValue(), // soffset -- will be set by setBufferOffsets
8858 SDValue(), // offset -- will be set by setBufferOffsets
8859 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8860 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8861 };
8862 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
8863
8864 EVT VT = Op.getValueType();
8865 auto *M = cast<MemSDNode>(Op);
8866
8867 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8868 Op->getVTList(), Ops, VT, M->getMemOperand());
8869 }
8870 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8871 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
8872 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
8873 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8874 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8875 SDValue Ops[] = {
8876 Op.getOperand(0), // Chain
8877 Op.getOperand(2), // src
8878 Op.getOperand(3), // cmp
8879 Rsrc, // rsrc
8880 DAG.getConstant(0, DL, MVT::i32), // vindex
8881 Offsets.first, // voffset
8882 SOffset, // soffset
8883 Offsets.second, // offset
8884 Op.getOperand(7), // cachepolicy
8885 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8886 };
8887 EVT VT = Op.getValueType();
8888 auto *M = cast<MemSDNode>(Op);
8889
8890 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8891 Op->getVTList(), Ops, VT, M->getMemOperand());
8892 }
8893 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
8895 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
8896 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
8897 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
8898 SDValue Ops[] = {
8899 Op.getOperand(0), // Chain
8900 Op.getOperand(2), // src
8901 Op.getOperand(3), // cmp
8902 Rsrc, // rsrc
8903 Op.getOperand(5), // vindex
8904 Offsets.first, // voffset
8905 SOffset, // soffset
8906 Offsets.second, // offset
8907 Op.getOperand(8), // cachepolicy
8908 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8909 };
8910 EVT VT = Op.getValueType();
8911 auto *M = cast<MemSDNode>(Op);
8912
8913 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8914 Op->getVTList(), Ops, VT, M->getMemOperand());
8915 }
8916 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
8917 MemSDNode *M = cast<MemSDNode>(Op);
8918 SDValue NodePtr = M->getOperand(2);
8919 SDValue RayExtent = M->getOperand(3);
8920 SDValue RayOrigin = M->getOperand(4);
8921 SDValue RayDir = M->getOperand(5);
8922 SDValue RayInvDir = M->getOperand(6);
8923 SDValue TDescr = M->getOperand(7);
8924
8925 assert(NodePtr.getValueType() == MVT::i32 ||
8926 NodePtr.getValueType() == MVT::i64);
8927 assert(RayDir.getValueType() == MVT::v3f16 ||
8928 RayDir.getValueType() == MVT::v3f32);
8929
8930 if (!Subtarget->hasGFX10_AEncoding()) {
8931 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
8932 return SDValue();
8933 }
8934
8935 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
8936 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8937 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8938 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
8939 const bool Is64 = NodePtr.getValueType() == MVT::i64;
8940 const unsigned NumVDataDwords = 4;
8941 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
8942 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
8943 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
8944 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
8945 IsGFX12Plus;
8946 const unsigned BaseOpcodes[2][2] = {
8947 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
8948 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
8949 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
8950 int Opcode;
8951 if (UseNSA) {
8952 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
8953 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
8954 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
8955 : AMDGPU::MIMGEncGfx10NSA,
8956 NumVDataDwords, NumVAddrDwords);
8957 } else {
8958 assert(!IsGFX12Plus);
8959 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
8960 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
8961 : AMDGPU::MIMGEncGfx10Default,
8962 NumVDataDwords, NumVAddrDwords);
8963 }
8964 assert(Opcode != -1);
8965
8966 SmallVector<SDValue, 16> Ops;
8967
8968 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
8969 SmallVector<SDValue, 3> Lanes;
8970 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
8971 if (Lanes[0].getValueSizeInBits() == 32) {
8972 for (unsigned I = 0; I < 3; ++I)
8973 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
8974 } else {
8975 if (IsAligned) {
8976 Ops.push_back(
8977 DAG.getBitcast(MVT::i32,
8978 DAG.getBuildVector(MVT::v2f16, DL,
8979 { Lanes[0], Lanes[1] })));
8980 Ops.push_back(Lanes[2]);
8981 } else {
8982 SDValue Elt0 = Ops.pop_back_val();
8983 Ops.push_back(
8984 DAG.getBitcast(MVT::i32,
8985 DAG.getBuildVector(MVT::v2f16, DL,
8986 { Elt0, Lanes[0] })));
8987 Ops.push_back(
8988 DAG.getBitcast(MVT::i32,
8989 DAG.getBuildVector(MVT::v2f16, DL,
8990 { Lanes[1], Lanes[2] })));
8991 }
8992 }
8993 };
8994
8995 if (UseNSA && IsGFX11Plus) {
8996 Ops.push_back(NodePtr);
8997 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
8998 Ops.push_back(RayOrigin);
8999 if (IsA16) {
9000 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9001 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9002 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9003 for (unsigned I = 0; I < 3; ++I) {
9004 MergedLanes.push_back(DAG.getBitcast(
9005 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9006 {DirLanes[I], InvDirLanes[I]})));
9007 }
9008 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9009 } else {
9010 Ops.push_back(RayDir);
9011 Ops.push_back(RayInvDir);
9012 }
9013 } else {
9014 if (Is64)
9015 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9016 2);
9017 else
9018 Ops.push_back(NodePtr);
9019
9020 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9021 packLanes(RayOrigin, true);
9022 packLanes(RayDir, true);
9023 packLanes(RayInvDir, false);
9024 }
9025
9026 if (!UseNSA) {
9027 // Build a single vector containing all the operands so far prepared.
9028 if (NumVAddrDwords > 12) {
9029 SDValue Undef = DAG.getUNDEF(MVT::i32);
9030 Ops.append(16 - Ops.size(), Undef);
9031 }
9032 assert(Ops.size() >= 8 && Ops.size() <= 12);
9033 SDValue MergedOps = DAG.getBuildVector(
9034 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9035 Ops.clear();
9036 Ops.push_back(MergedOps);
9037 }
9038
9039 Ops.push_back(TDescr);
9040 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9041 Ops.push_back(M->getChain());
9042
9043 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9044 MachineMemOperand *MemRef = M->getMemOperand();
9045 DAG.setNodeMemRefs(NewNode, {MemRef});
9046 return SDValue(NewNode, 0);
9047 }
9048 case Intrinsic::amdgcn_global_atomic_fmin:
9049 case Intrinsic::amdgcn_global_atomic_fmax:
9050 case Intrinsic::amdgcn_global_atomic_fmin_num:
9051 case Intrinsic::amdgcn_global_atomic_fmax_num:
9052 case Intrinsic::amdgcn_flat_atomic_fmin:
9053 case Intrinsic::amdgcn_flat_atomic_fmax:
9054 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9055 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9056 MemSDNode *M = cast<MemSDNode>(Op);
9057 SDValue Ops[] = {
9058 M->getOperand(0), // Chain
9059 M->getOperand(2), // Ptr
9060 M->getOperand(3) // Value
9061 };
9062 unsigned Opcode = 0;
9063 switch (IntrID) {
9064 case Intrinsic::amdgcn_global_atomic_fmin:
9065 case Intrinsic::amdgcn_global_atomic_fmin_num:
9066 case Intrinsic::amdgcn_flat_atomic_fmin:
9067 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9068 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9069 break;
9070 }
9071 case Intrinsic::amdgcn_global_atomic_fmax:
9072 case Intrinsic::amdgcn_global_atomic_fmax_num:
9073 case Intrinsic::amdgcn_flat_atomic_fmax:
9074 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9075 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9076 break;
9077 }
9078 default:
9079 llvm_unreachable("unhandled atomic opcode");
9080 }
9081 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9082 M->getVTList(), Ops, M->getMemoryVT(),
9083 M->getMemOperand());
9084 }
9085 case Intrinsic::amdgcn_s_get_barrier_state: {
9086 SDValue Chain = Op->getOperand(0);
9087 SmallVector<SDValue, 2> Ops;
9088 unsigned Opc;
9089 bool IsInlinableBarID = false;
9090 int64_t BarID;
9091
9092 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9093 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9094 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9095 }
9096
9097 if (IsInlinableBarID) {
9098 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9099 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9100 Ops.push_back(K);
9101 } else {
9102 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9103 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9104 Ops.push_back(M0Val.getValue(0));
9105 }
9106
9107 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9108 return SDValue(NewMI, 0);
9109 }
9110 default:
9111
9112 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9113 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9114 return lowerImage(Op, ImageDimIntr, DAG, true);
9115
9116 return SDValue();
9117 }
9118}
9119
9120// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9121// dwordx4 if on SI and handle TFE loads.
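// For a TFE load the extra status dword rides along with the data: e.g. a v3f32
// TFE result is fetched below as a 4 x i32 operation, after which the leading
// value dwords are bitcast back to the original type and the trailing dword
// becomes the status operand of the merged result.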
9122SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9123 SDVTList VTList,
9124 ArrayRef<SDValue> Ops, EVT MemVT,
9125 MachineMemOperand *MMO,
9126 SelectionDAG &DAG) const {
9127 LLVMContext &C = *DAG.getContext();
9128 MachineFunction &MF = DAG.getMachineFunction();
9129 EVT VT = VTList.VTs[0];
9130
9131 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9132 bool IsTFE = VTList.NumVTs == 3;
9133 if (IsTFE) {
9134 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9135 unsigned NumOpDWords = NumValueDWords + 1;
9136 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9137 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9138 MachineMemOperand *OpDWordsMMO =
9139 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9140 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9141 OpDWordsVT, OpDWordsMMO, DAG);
9142 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9143 DAG.getVectorIdxConstant(NumValueDWords, DL));
9144 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9145 SDValue ValueDWords =
9146 NumValueDWords == 1
9147 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9148 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9149 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9150 ZeroIdx);
9151 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9152 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9153 }
9154
9155 if (!Subtarget->hasDwordx3LoadStores() &&
9156 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9157 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9158 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9159 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9160 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9161 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9162 WidenedMemVT, WidenedMMO);
9163 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9164 DAG.getVectorIdxConstant(0, DL));
9165 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9166 }
9167
9168 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9169}
9170
9171SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9172 bool ImageStore) const {
9173 EVT StoreVT = VData.getValueType();
9174
9175 // No change for f16 and legal vector D16 types.
9176 if (!StoreVT.isVector())
9177 return VData;
9178
9179 SDLoc DL(VData);
9180 unsigned NumElements = StoreVT.getVectorNumElements();
9181
9182 if (Subtarget->hasUnpackedD16VMem()) {
9183 // We need to unpack the packed data to store.
9184 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9185 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9186
9187 EVT EquivStoreVT =
9188 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9189 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9190 return DAG.UnrollVectorOp(ZExt.getNode());
9191 }
9192
9193 // The sq block of gfx8.1 does not estimate register use correctly for d16
9194 // image store instructions. The data operand is computed as if it were not a
9195 // d16 image instruction.
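// For example, v4f16 data is rewritten below as two packed i32s followed by two
// undef i32s (a v4i32), so the operand occupies the same number of registers as
// a non-d16 store would.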
9196 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9197 // Bitcast to i16
9198 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9199 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9200
9201 // Decompose into scalars
9202 SmallVector<SDValue, 4> Elts;
9203 DAG.ExtractVectorElements(IntVData, Elts);
9204
9205 // Group pairs of i16 into v2i16 and bitcast to i32
9206 SmallVector<SDValue, 4> PackedElts;
9207 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9208 SDValue Pair =
9209 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9210 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9211 PackedElts.push_back(IntPair);
9212 }
9213 if ((NumElements % 2) == 1) {
9214 // Handle v3i16
9215 unsigned I = Elts.size() / 2;
9216 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9217 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9218 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9219 PackedElts.push_back(IntPair);
9220 }
9221
9222 // Pad using UNDEF
9223 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9224
9225 // Build final vector
9226 EVT VecVT =
9227 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9228 return DAG.getBuildVector(VecVT, DL, PackedElts);
9229 }
9230
9231 if (NumElements == 3) {
9232 EVT IntStoreVT =
9233 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9234 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9235
9236 EVT WidenedStoreVT = EVT::getVectorVT(
9237 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9238 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9239 WidenedStoreVT.getStoreSizeInBits());
9240 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9241 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9242 }
9243
9244 assert(isTypeLegal(StoreVT));
9245 return VData;
9246}
9247
9248SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9249 SelectionDAG &DAG) const {
9250 SDLoc DL(Op);
9251 SDValue Chain = Op.getOperand(0);
9252 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9253 MachineFunction &MF = DAG.getMachineFunction();
9254
9255 switch (IntrinsicID) {
9256 case Intrinsic::amdgcn_exp_compr: {
9257 if (!Subtarget->hasCompressedExport()) {
9258 DiagnosticInfoUnsupported BadIntrin(
9259 MF.getFunction(),
9260 "intrinsic not supported on subtarget", DL.getDebugLoc());
9261 DAG.getContext()->diagnose(BadIntrin);
9262 }
9263 SDValue Src0 = Op.getOperand(4);
9264 SDValue Src1 = Op.getOperand(5);
9265 // Hack around illegal type on SI by directly selecting it.
9266 if (isTypeLegal(Src0.getValueType()))
9267 return SDValue();
9268
9269 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9270 SDValue Undef = DAG.getUNDEF(MVT::f32);
9271 const SDValue Ops[] = {
9272 Op.getOperand(2), // tgt
9273 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9274 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9275 Undef, // src2
9276 Undef, // src3
9277 Op.getOperand(7), // vm
9278 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9279 Op.getOperand(3), // en
9280 Op.getOperand(0) // Chain
9281 };
9282
9283 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9284 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9285 }
9286 case Intrinsic::amdgcn_s_barrier: {
9287 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9288 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9289 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9290 if (WGSize <= ST.getWavefrontSize())
9291 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9292 Op.getOperand(0)), 0);
9293 }
9294
9295 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9296 if (ST.hasSplitBarriers()) {
9297 SDValue K =
9298 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9299 SDValue BarSignal =
9300 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9301 MVT::Other, K, Op.getOperand(0)),
9302 0);
9303 SDValue BarWait =
9304 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9305 BarSignal.getValue(0)),
9306 0);
9307 return BarWait;
9308 }
9309
9310 return SDValue();
9311 };
9312 case Intrinsic::amdgcn_tbuffer_store: {
9313 SDValue VData = Op.getOperand(2);
9314 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9315 if (IsD16)
9316 VData = handleD16VData(VData, DAG);
9317 unsigned Dfmt = Op.getConstantOperandVal(8);
9318 unsigned Nfmt = Op.getConstantOperandVal(9);
9319 unsigned Glc = Op.getConstantOperandVal(10);
9320 unsigned Slc = Op.getConstantOperandVal(11);
9321 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9322 SDValue Ops[] = {
9323 Chain,
9324 VData, // vdata
9325 Op.getOperand(3), // rsrc
9326 Op.getOperand(4), // vindex
9327 Op.getOperand(5), // voffset
9328 Op.getOperand(6), // soffset
9329 Op.getOperand(7), // offset
9330 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9331 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9332 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9333 };
9334 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9335 AMDGPUISD::TBUFFER_STORE_FORMAT;
9336 MemSDNode *M = cast<MemSDNode>(Op);
9337 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9338 M->getMemoryVT(), M->getMemOperand());
9339 }
9340
9341 case Intrinsic::amdgcn_struct_tbuffer_store:
9342 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9343 SDValue VData = Op.getOperand(2);
9344 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9345 if (IsD16)
9346 VData = handleD16VData(VData, DAG);
9347 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9348 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9349 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9350 SDValue Ops[] = {
9351 Chain,
9352 VData, // vdata
9353 Rsrc, // rsrc
9354 Op.getOperand(4), // vindex
9355 Offsets.first, // voffset
9356 SOffset, // soffset
9357 Offsets.second, // offset
9358 Op.getOperand(7), // format
9359 Op.getOperand(8), // cachepolicy, swizzled buffer
9360 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9361 };
9362 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9363 AMDGPUISD::TBUFFER_STORE_FORMAT;
9364 MemSDNode *M = cast<MemSDNode>(Op);
9365 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9366 M->getMemoryVT(), M->getMemOperand());
9367 }
9368
9369 case Intrinsic::amdgcn_raw_tbuffer_store:
9370 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9371 SDValue VData = Op.getOperand(2);
9372 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9373 if (IsD16)
9374 VData = handleD16VData(VData, DAG);
9375 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9376 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9377 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9378 SDValue Ops[] = {
9379 Chain,
9380 VData, // vdata
9381 Rsrc, // rsrc
9382 DAG.getConstant(0, DL, MVT::i32), // vindex
9383 Offsets.first, // voffset
9384 SOffset, // soffset
9385 Offsets.second, // offset
9386 Op.getOperand(6), // format
9387 Op.getOperand(7), // cachepolicy, swizzled buffer
9388 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9389 };
9390 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9391 AMDGPUISD::TBUFFER_STORE_FORMAT;
9392 MemSDNode *M = cast<MemSDNode>(Op);
9393 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9394 M->getMemoryVT(), M->getMemOperand());
9395 }
9396
9397 case Intrinsic::amdgcn_buffer_store:
9398 case Intrinsic::amdgcn_buffer_store_format: {
9399 SDValue VData = Op.getOperand(2);
9400 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9401 if (IsD16)
9402 VData = handleD16VData(VData, DAG);
9403 unsigned Glc = Op.getConstantOperandVal(6);
9404 unsigned Slc = Op.getConstantOperandVal(7);
9405 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9406 SDValue Ops[] = {
9407 Chain,
9408 VData,
9409 Op.getOperand(3), // rsrc
9410 Op.getOperand(4), // vindex
9411 SDValue(), // voffset -- will be set by setBufferOffsets
9412 SDValue(), // soffset -- will be set by setBufferOffsets
9413 SDValue(), // offset -- will be set by setBufferOffsets
9414 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9415 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9416 };
9417 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
9418
9419 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9420 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9421 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9422 MemSDNode *M = cast<MemSDNode>(Op);
9423
9424 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9425 EVT VDataType = VData.getValueType().getScalarType();
9426 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9427 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9428
9429 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9430 M->getMemoryVT(), M->getMemOperand());
9431 }
9432
9433 case Intrinsic::amdgcn_raw_buffer_store:
9434 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9435 case Intrinsic::amdgcn_raw_buffer_store_format:
9436 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9437 const bool IsFormat =
9438 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9439 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9440
9441 SDValue VData = Op.getOperand(2);
9442 EVT VDataVT = VData.getValueType();
9443 EVT EltType = VDataVT.getScalarType();
9444 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9445 if (IsD16) {
9446 VData = handleD16VData(VData, DAG);
9447 VDataVT = VData.getValueType();
9448 }
9449
9450 if (!isTypeLegal(VDataVT)) {
9451 VData =
9452 DAG.getNode(ISD::BITCAST, DL,
9453 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9454 }
9455
9456 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9457 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9458 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9459 SDValue Ops[] = {
9460 Chain,
9461 VData,
9462 Rsrc,
9463 DAG.getConstant(0, DL, MVT::i32), // vindex
9464 Offsets.first, // voffset
9465 SOffset, // soffset
9466 Offsets.second, // offset
9467 Op.getOperand(6), // cachepolicy, swizzled buffer
9468 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9469 };
9470 unsigned Opc =
9471 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9472 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9473 MemSDNode *M = cast<MemSDNode>(Op);
9474
9475 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9476 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9477 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9478
9479 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9480 M->getMemoryVT(), M->getMemOperand());
9481 }
9482
9483 case Intrinsic::amdgcn_struct_buffer_store:
9484 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9485 case Intrinsic::amdgcn_struct_buffer_store_format:
9486 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9487 const bool IsFormat =
9488 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9489 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9490
9491 SDValue VData = Op.getOperand(2);
9492 EVT VDataVT = VData.getValueType();
9493 EVT EltType = VDataVT.getScalarType();
9494 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9495
9496 if (IsD16) {
9497 VData = handleD16VData(VData, DAG);
9498 VDataVT = VData.getValueType();
9499 }
9500
9501 if (!isTypeLegal(VDataVT)) {
9502 VData =
9503 DAG.getNode(ISD::BITCAST, DL,
9504 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9505 }
9506
9507 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9508 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9509 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9510 SDValue Ops[] = {
9511 Chain,
9512 VData,
9513 Rsrc,
9514 Op.getOperand(4), // vindex
9515 Offsets.first, // voffset
9516 SOffset, // soffset
9517 Offsets.second, // offset
9518 Op.getOperand(7), // cachepolicy, swizzled buffer
9519 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9520 };
9521 unsigned Opc =
9522 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9523 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9524 MemSDNode *M = cast<MemSDNode>(Op);
9525
9526 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9527 EVT VDataType = VData.getValueType().getScalarType();
9528 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9529 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9530
9531 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9532 M->getMemoryVT(), M->getMemOperand());
9533 }
9534 case Intrinsic::amdgcn_raw_buffer_load_lds:
9535 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9536 case Intrinsic::amdgcn_struct_buffer_load_lds:
9537 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9538 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9539 unsigned Opc;
9540 bool HasVIndex =
9541 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9542 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9543 unsigned OpOffset = HasVIndex ? 1 : 0;
9544 SDValue VOffset = Op.getOperand(5 + OpOffset);
9545 bool HasVOffset = !isNullConstant(VOffset);
9546 unsigned Size = Op->getConstantOperandVal(4);
9547
9548 switch (Size) {
9549 default:
9550 return SDValue();
9551 case 1:
9552 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9553 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9554 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9555 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9556 break;
9557 case 2:
9558 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9559 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9560 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9561 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9562 break;
9563 case 4:
9564 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9565 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9566 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9567 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9568 break;
9569 }
9570
9571 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9572
9573 SmallVector<SDValue, 8> Ops;
9574
9575 if (HasVIndex && HasVOffset)
9576 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9577 { Op.getOperand(5), // VIndex
9578 VOffset }));
9579 else if (HasVIndex)
9580 Ops.push_back(Op.getOperand(5));
9581 else if (HasVOffset)
9582 Ops.push_back(VOffset);
9583
9584 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9585 Ops.push_back(Rsrc);
9586 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9587 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9588 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9589 Ops.push_back(
9590 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9591 Ops.push_back(DAG.getTargetConstant(
9592 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9593 Ops.push_back(M0Val.getValue(0)); // Chain
9594 Ops.push_back(M0Val.getValue(1)); // Glue
9595
9596 auto *M = cast<MemSDNode>(Op);
9597 MachineMemOperand *LoadMMO = M->getMemOperand();
9598 // Don't set the offset value here because the pointer points to the base of
9599 // the buffer.
9600 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9601
9602 MachinePointerInfo StorePtrI = LoadPtrI;
9603 LoadPtrI.V = PoisonValue::get(
9607
9608 auto F = LoadMMO->getFlags() &
9609 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9610 LoadMMO =
9611 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9612 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9613
9614 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9615 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9616 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9617
9618 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9619 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9620
9621 return SDValue(Load, 0);
9622 }
9623 case Intrinsic::amdgcn_global_load_lds: {
9624 unsigned Opc;
9625 unsigned Size = Op->getConstantOperandVal(4);
9626 switch (Size) {
9627 default:
9628 return SDValue();
9629 case 1:
9630 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9631 break;
9632 case 2:
9633 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9634 break;
9635 case 4:
9636 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9637 break;
9638 }
9639
9640 auto *M = cast<MemSDNode>(Op);
9641 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9642
9643 SmallVector<SDValue, 6> Ops;
9644
9645 SDValue Addr = Op.getOperand(2); // Global ptr
9646 SDValue VOffset;
9647 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9648 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9649 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9650 SDValue LHS = Addr.getOperand(0);
9651 SDValue RHS = Addr.getOperand(1);
9652
9653 if (LHS->isDivergent())
9654 std::swap(LHS, RHS);
9655
9656 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9657 RHS.getOperand(0).getValueType() == MVT::i32) {
9658 // add (i64 sgpr), (zero_extend (i32 vgpr))
9659 Addr = LHS;
9660 VOffset = RHS.getOperand(0);
9661 }
9662 }
9663
9664 Ops.push_back(Addr);
9665 if (!Addr->isDivergent()) {
9666 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9667 if (!VOffset)
9668 VOffset = SDValue(
9669 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9670 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9671 Ops.push_back(VOffset);
9672 }
9673
9674 Ops.push_back(Op.getOperand(5)); // Offset
9675 Ops.push_back(Op.getOperand(6)); // CPol
9676 Ops.push_back(M0Val.getValue(0)); // Chain
9677 Ops.push_back(M0Val.getValue(1)); // Glue
9678
9679 MachineMemOperand *LoadMMO = M->getMemOperand();
9680 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9681 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9682 MachinePointerInfo StorePtrI = LoadPtrI;
9683 LoadPtrI.V = PoisonValue::get(
9684 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9685 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9686 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9687 auto F = LoadMMO->getFlags() &
9688 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9689 LoadMMO =
9690 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9691 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9692 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9693 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9694 LoadMMO->getAAInfo());
9695
9696 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9697 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9698
9699 return SDValue(Load, 0);
9700 }
9701 case Intrinsic::amdgcn_end_cf:
9702 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9703 Op->getOperand(2), Chain), 0);
9704 case Intrinsic::amdgcn_s_barrier_init:
9705 case Intrinsic::amdgcn_s_barrier_join:
9706 case Intrinsic::amdgcn_s_wakeup_barrier: {
9707 SDValue Chain = Op->getOperand(0);
9708 SmallVector<SDValue, 2> Ops;
9709 SDValue BarOp = Op->getOperand(2);
9710 unsigned Opc;
9711 bool IsInlinableBarID = false;
9712 int64_t BarVal;
9713
9714 if (isa<ConstantSDNode>(BarOp)) {
9715 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9716 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9717 }
9718
9719 if (IsInlinableBarID) {
9720 switch (IntrinsicID) {
9721 default:
9722 return SDValue();
9723 case Intrinsic::amdgcn_s_barrier_init:
9724 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9725 break;
9726 case Intrinsic::amdgcn_s_barrier_join:
9727 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9728 break;
9729 case Intrinsic::amdgcn_s_wakeup_barrier:
9730 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9731 break;
9732 }
9733
9734 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9735 Ops.push_back(K);
9736 } else {
9737 switch (IntrinsicID) {
9738 default:
9739 return SDValue();
9740 case Intrinsic::amdgcn_s_barrier_init:
9741 Opc = AMDGPU::S_BARRIER_INIT_M0;
9742 break;
9743 case Intrinsic::amdgcn_s_barrier_join:
9744 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9745 break;
9746 case Intrinsic::amdgcn_s_wakeup_barrier:
9747 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9748 break;
9749 }
9750 }
9751
9752 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9753 SDValue M0Val;
9754 // Member count will be read from M0[16:22]
9755 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9756 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9757
9758 if (!IsInlinableBarID) {
9759 // If reference to barrier id is not an inline constant then it must be
9760 // referenced with M0[4:0]. Perform an OR with the member count to
9761 // include it in M0.
9762 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9763 Op.getOperand(2), M0Val),
9764 0);
9765 }
9766 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9767 } else if (!IsInlinableBarID) {
9768 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9769 }
9770
9771 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9772 return SDValue(NewMI, 0);
9773 }
9774 default: {
9775 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9776 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9777 return lowerImage(Op, ImageDimIntr, DAG, true);
9778
9779 return Op;
9780 }
9781 }
9782}
9783
9784// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9785// offset (the offset that is included in bounds checking and swizzling, to be
9786// split between the instruction's voffset and immoffset fields) and soffset
9787// (the offset that is excluded from bounds checking and swizzling, to go in
9788// the instruction's soffset field). This function takes the first kind of
9789// offset and figures out how to split it between voffset and immoffset.
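// For example, assuming the pre-GFX12 maximum immediate of 4095, a combined
// offset of 5000 comes back as voffset = 4096 and immoffset = 904: the voffset
// part is kept a large power of two so it is more likely to be CSEd with the
// copy/add emitted for a neighbouring access.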
9790std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9791 SDValue Offset, SelectionDAG &DAG) const {
9792 SDLoc DL(Offset);
9793 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9794 SDValue N0 = Offset;
9795 ConstantSDNode *C1 = nullptr;
9796
9797 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9798 N0 = SDValue();
9799 else if (DAG.isBaseWithConstantOffset(N0)) {
9800 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9801 N0 = N0.getOperand(0);
9802 }
9803
9804 if (C1) {
9805 unsigned ImmOffset = C1->getZExtValue();
9806 // If the immediate value is too big for the immoffset field, put only bits
9807 // that would normally fit in the immoffset field. The remaining value that
9808 // is copied/added for the voffset field is a large power of 2, and it
9809 // stands more chance of being CSEd with the copy/add for another similar
9810 // load/store.
9811 // However, do not do that rounding down if that is a negative
9812 // number, as it appears to be illegal to have a negative offset in the
9813 // vgpr, even if adding the immediate offset makes it positive.
9814 unsigned Overflow = ImmOffset & ~MaxImm;
9815 ImmOffset -= Overflow;
9816 if ((int32_t)Overflow < 0) {
9817 Overflow += ImmOffset;
9818 ImmOffset = 0;
9819 }
9820 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9821 if (Overflow) {
9822 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9823 if (!N0)
9824 N0 = OverflowVal;
9825 else {
9826 SDValue Ops[] = { N0, OverflowVal };
9827 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9828 }
9829 }
9830 }
9831 if (!N0)
9832 N0 = DAG.getConstant(0, DL, MVT::i32);
9833 if (!C1)
9834 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
9835 return {N0, SDValue(C1, 0)};
9836}
9837
9838// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
9839// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
9840// pointed to by Offsets.
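// For instance, a constant combined offset that fits in the instruction's
// immediate field is returned as voffset = 0, soffset = 0, instoffset = the
// constant; when no base-plus-constant split is possible, the whole expression
// goes in voffset with a zero (or SGPR_NULL, on subtargets with a restricted
// soffset) soffset.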
9841void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
9842 SelectionDAG &DAG, SDValue *Offsets,
9843 Align Alignment) const {
9844 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9845 SDLoc DL(CombinedOffset);
9846 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
9847 uint32_t Imm = C->getZExtValue();
9848 uint32_t SOffset, ImmOffset;
9849 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
9850 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
9851 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9852 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9853 return;
9854 }
9855 }
9856 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
9857 SDValue N0 = CombinedOffset.getOperand(0);
9858 SDValue N1 = CombinedOffset.getOperand(1);
9859 uint32_t SOffset, ImmOffset;
9860 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
9861 if (Offset >= 0 &&
9862 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
9863 Offsets[0] = N0;
9864 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9865 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9866 return;
9867 }
9868 }
9869
9870 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
9871 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
9872 : DAG.getConstant(0, DL, MVT::i32);
9873
9874 Offsets[0] = CombinedOffset;
9875 Offsets[1] = SOffsetZero;
9876 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
9877}
9878
9879SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
9880 SelectionDAG &DAG) const {
9881 if (!MaybePointer.getValueType().isScalarInteger())
9882 return MaybePointer;
9883
9884 SDLoc DL(MaybePointer);
9885
9886 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
9887 return Rsrc;
9888}
9889
9890// Wrap a global or flat pointer into a buffer intrinsic using the flags
9891// specified in the intrinsic.
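// The resulting V# is assembled below as: word0 = pointer[31:0], word1 =
// pointer[47:32] in the low 16 bits with the 16-bit stride in the high bits,
// word2 = NumRecords, word3 = Flags, and is then bitcast to the i128 rsrc value.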
9892SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
9893 SelectionDAG &DAG) const {
9894 SDLoc Loc(Op);
9895
9896 SDValue Pointer = Op->getOperand(1);
9897 SDValue Stride = Op->getOperand(2);
9898 SDValue NumRecords = Op->getOperand(3);
9899 SDValue Flags = Op->getOperand(4);
9900
9901 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
9902 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
9903 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
9904 std::optional<uint32_t> ConstStride = std::nullopt;
9905 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
9906 ConstStride = ConstNode->getZExtValue();
9907
9908 SDValue NewHighHalf = Masked;
9909 if (!ConstStride || *ConstStride != 0) {
9910 SDValue ShiftedStride;
9911 if (ConstStride) {
9912 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
9913 } else {
9914 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
9915 ShiftedStride =
9916 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
9917 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
9918 }
9919 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
9920 }
9921
9922 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
9923 NewHighHalf, NumRecords, Flags);
9924 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
9925 return RsrcPtr;
9926}
9927
9928// Handle 8 bit and 16 bit buffer loads
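// The hardware zero-extends the byte/short into a full 32-bit register, so the
// node is created with an i32 result and then truncated and bitcast back to the
// requested type.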
9929SDValue
9930SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
9931 SDLoc DL, ArrayRef<SDValue> Ops,
9932 MachineMemOperand *MMO) const {
9933 EVT IntVT = LoadVT.changeTypeToInteger();
9934 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
9935 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
9936
9937 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
9938 SDValue BufferLoad =
9939 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
9940 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
9941 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
9942
9943 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
9944}
9945
9946// Handle 8 bit and 16 bit buffer stores
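// f16 data is first reinterpreted as i16, then the value is any-extended to i32
// to match what the byte/short store nodes expect.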
9947SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
9948 EVT VDataType, SDLoc DL,
9949 SDValue Ops[],
9950 MemSDNode *M) const {
9951 if (VDataType == MVT::f16)
9952 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
9953
9954 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
9955 Ops[1] = BufferStoreExt;
9956 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
9957 AMDGPUISD::BUFFER_STORE_SHORT;
9958 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
9959 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
9960 M->getMemOperand());
9961}
9962
9963 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
9964 ISD::LoadExtType ExtType, SDValue Op,
9965 const SDLoc &SL, EVT VT) {
9966 if (VT.bitsLT(Op.getValueType()))
9967 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
9968
9969 switch (ExtType) {
9970 case ISD::SEXTLOAD:
9971 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
9972 case ISD::ZEXTLOAD:
9973 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
9974 case ISD::EXTLOAD:
9975 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
9976 case ISD::NON_EXTLOAD:
9977 return Op;
9978 }
9979
9980 llvm_unreachable("invalid ext type");
9981}
9982
9983// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
9984// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
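// For example, a uniform, 4-byte-aligned i16 load from constant address space is
// re-emitted below as an i32 load and then truncated (or sign/zero extended in
// register for extloads), since pre-GFX12 scalar memory has no sub-dword loads.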
9985SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
9986 SelectionDAG &DAG = DCI.DAG;
9987 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
9988 return SDValue();
9989
9990 // FIXME: Constant loads should all be marked invariant.
9991 unsigned AS = Ld->getAddressSpace();
9992 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
9993 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9994 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
9995 return SDValue();
9996
9997 // Don't do this early, since it may interfere with adjacent load merging for
9998 // illegal types. We can avoid losing alignment information for exotic types
9999 // pre-legalize.
10000 EVT MemVT = Ld->getMemoryVT();
10001 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10002 MemVT.getSizeInBits() >= 32)
10003 return SDValue();
10004
10005 SDLoc SL(Ld);
10006
10007 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10008 "unexpected vector extload");
10009
10010 // TODO: Drop only high part of range.
10011 SDValue Ptr = Ld->getBasePtr();
10012 SDValue NewLoad = DAG.getLoad(
10013 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10014 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10015 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10016 nullptr); // Drop ranges
10017
10018 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10019 if (MemVT.isFloatingPoint()) {
10020 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10021 "unexpected fp extload");
10022 TruncVT = MemVT.changeTypeToInteger();
10023 }
10024
10025 SDValue Cvt = NewLoad;
10026 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10027 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10028 DAG.getValueType(TruncVT));
10029 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10030 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10031 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10032 } else {
10033 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10034 }
10035
10036 EVT VT = Ld->getValueType(0);
10037 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10038
10039 DCI.AddToWorklist(Cvt.getNode());
10040
10041 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10042 // the appropriate extension from the 32-bit load.
10043 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10044 DCI.AddToWorklist(Cvt.getNode());
10045
10046 // Handle conversion back to floating point if necessary.
10047 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10048
10049 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10050}
10051
10052 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10053 const SIMachineFunctionInfo &Info) {
10054 // TODO: Should check if the address can definitely not access stack.
10055 if (Info.isEntryFunction())
10056 return Info.getUserSGPRInfo().hasFlatScratchInit();
10057 return true;
10058}
10059
10060SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10061 SDLoc DL(Op);
10062 LoadSDNode *Load = cast<LoadSDNode>(Op);
10063 ISD::LoadExtType ExtType = Load->getExtensionType();
10064 EVT MemVT = Load->getMemoryVT();
10065
10066 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10067 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10068 return SDValue();
10069
10070 // FIXME: Copied from PPC
10071 // First, load into 32 bits, then truncate to 1 bit.
10072
10073 SDValue Chain = Load->getChain();
10074 SDValue BasePtr = Load->getBasePtr();
10075 MachineMemOperand *MMO = Load->getMemOperand();
10076
10077 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10078
10079 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10080 BasePtr, RealMemVT, MMO);
10081
10082 if (!MemVT.isVector()) {
10083 SDValue Ops[] = {
10084 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10085 NewLD.getValue(1)
10086 };
10087
10088 return DAG.getMergeValues(Ops, DL);
10089 }
10090
10091 SmallVector<SDValue, 3> Elts;
10092 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10093 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10094 DAG.getConstant(I, DL, MVT::i32));
10095
10096 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10097 }
10098
10099 SDValue Ops[] = {
10100 DAG.getBuildVector(MemVT, DL, Elts),
10101 NewLD.getValue(1)
10102 };
10103
10104 return DAG.getMergeValues(Ops, DL);
10105 }
10106
10107 if (!MemVT.isVector())
10108 return SDValue();
10109
10110 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10111 "Custom lowering for non-i32 vectors hasn't been implemented.");
10112
10113 Align Alignment = Load->getAlign();
10114 unsigned AS = Load->getAddressSpace();
10115 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10116 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10117 return SplitVectorLoad(Op, DAG);
10118 }
10119
10120 MachineFunction &MF = DAG.getMachineFunction();
10121 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10122 // If there is a possibility that flat instructions access scratch memory
10123 // then we need to use the same legalization rules we use for private.
10124 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10125 !Subtarget->hasMultiDwordFlatScratchAddressing())
10126 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10127 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10128
10129 unsigned NumElements = MemVT.getVectorNumElements();
10130
10131 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10132 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10133 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10134 if (MemVT.isPow2VectorType() ||
10135 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10136 return SDValue();
10137 return WidenOrSplitVectorLoad(Op, DAG);
10138 }
10139 // Non-uniform loads will be selected to MUBUF instructions, so they
10140 // have the same legalization requirements as global and private
10141 // loads.
10142 //
10143 }
10144
10145 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10146 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10147 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10148 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10149 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10150 Alignment >= Align(4) && NumElements < 32) {
10151 if (MemVT.isPow2VectorType() ||
10152 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10153 return SDValue();
10154 return WidenOrSplitVectorLoad(Op, DAG);
10155 }
10156 // Non-uniform loads will be selected to MUBUF instructions, so they
10157 // have the same legalization requirements as global and private
10158 // loads.
10159 //
10160 }
10161 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10162 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10163 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10164 AS == AMDGPUAS::FLAT_ADDRESS) {
10165 if (NumElements > 4)
10166 return SplitVectorLoad(Op, DAG);
10167 // v3 loads not supported on SI.
10168 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10169 return WidenOrSplitVectorLoad(Op, DAG);
10170
10171 // v3 and v4 loads are supported for private and global memory.
10172 return SDValue();
10173 }
10174 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10175 // Depending on the setting of the private_element_size field in the
10176 // resource descriptor, we can only make private accesses up to a certain
10177 // size.
10178 switch (Subtarget->getMaxPrivateElementSize()) {
10179 case 4: {
10180 SDValue Ops[2];
10181 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10182 return DAG.getMergeValues(Ops, DL);
10183 }
10184 case 8:
10185 if (NumElements > 2)
10186 return SplitVectorLoad(Op, DAG);
10187 return SDValue();
10188 case 16:
10189 // Same as global/flat
10190 if (NumElements > 4)
10191 return SplitVectorLoad(Op, DAG);
10192 // v3 loads not supported on SI.
10193 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10194 return WidenOrSplitVectorLoad(Op, DAG);
10195
10196 return SDValue();
10197 default:
10198 llvm_unreachable("unsupported private_element_size");
10199 }
10200 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10201 unsigned Fast = 0;
10202 auto Flags = Load->getMemOperand()->getFlags();
10203     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10204                                            Load->getAlign(), Flags, &Fast) &&
10205 Fast > 1)
10206 return SDValue();
10207
10208 if (MemVT.isVector())
10209 return SplitVectorLoad(Op, DAG);
10210 }
10211
10212   if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10213                                       MemVT, *Load->getMemOperand())) {
10214 SDValue Ops[2];
10215 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10216 return DAG.getMergeValues(Ops, DL);
10217 }
10218
10219 return SDValue();
10220}
10221
10222SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10223 EVT VT = Op.getValueType();
10224 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10225 VT.getSizeInBits() == 512)
10226 return splitTernaryVectorOp(Op, DAG);
10227
10228 assert(VT.getSizeInBits() == 64);
10229
10230 SDLoc DL(Op);
10231 SDValue Cond = Op.getOperand(0);
10232
10233 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10234 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10235
10236 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10237 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10238
10239 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10240 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10241
10242 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10243
10244 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10245 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10246
10247 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10248
10249 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10250 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10251}
10252
10253// Catch division cases where we can use shortcuts with rcp and rsq
10254// instructions.
10255SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10256 SelectionDAG &DAG) const {
10257 SDLoc SL(Op);
10258 SDValue LHS = Op.getOperand(0);
10259 SDValue RHS = Op.getOperand(1);
10260 EVT VT = Op.getValueType();
10261 const SDNodeFlags Flags = Op->getFlags();
10262
10263 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10264                             DAG.getTarget().Options.UnsafeFPMath;
10265
10266 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10267 // Without !fpmath accuracy information, we can't do more because we don't
10268 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10269 // f16 is always accurate enough
10270 if (!AllowInaccurateRcp && VT != MVT::f16)
10271 return SDValue();
10272
10273 if (CLHS->isExactlyValue(1.0)) {
10274 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10275 // the CI documentation has a worst case error of 1 ulp.
10276 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10277 // use it as long as we aren't trying to use denormals.
10278 //
10279       // v_rcp_f16 and v_rsq_f16 DO support denormals, with 0.51 ulp accuracy.
10280
10281 // 1.0 / sqrt(x) -> rsq(x)
10282
10283 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10284 // error seems really high at 2^29 ULP.
10285 // 1.0 / x -> rcp(x)
10286 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10287 }
10288
10289 // Same as for 1.0, but expand the sign out of the constant.
10290 if (CLHS->isExactlyValue(-1.0)) {
10291 // -1.0 / x -> rcp (fneg x)
10292 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10293 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10294 }
10295 }
10296
10297 // For f16 require afn or arcp.
10298 // For f32 require afn.
10299 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10300 return SDValue();
10301
10302 // Turn into multiply by the reciprocal.
10303 // x / y -> x * (1.0 / y)
10304 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10305 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10306}
10307
10308SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10309 SelectionDAG &DAG) const {
10310 SDLoc SL(Op);
10311 SDValue X = Op.getOperand(0);
10312 SDValue Y = Op.getOperand(1);
10313 EVT VT = Op.getValueType();
10314 const SDNodeFlags Flags = Op->getFlags();
10315
10316 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10317                             DAG.getTarget().Options.UnsafeFPMath;
10318   if (!AllowInaccurateDiv)
10319 return SDValue();
10320
10321 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10322 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10323
10324 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10325 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10326
10327 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10328 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10329 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10330 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10331 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10332 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10333}
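// In scalar terms, the FMA chain above is roughly two Newton-Raphson
// refinements of the hardware reciprocal followed by one residual correction
// of the quotient:
//   r0 = rcp(y)
//   r1 = r0 + r0 * (1.0 - y * r0)
//   r2 = r1 + r1 * (1.0 - y * r1)
//   q  = x * r2
//   result = q + r2 * (x - y * q)
// Each (1.0 - y * r) and (x - y * q) term is computed with a single fused
// multiply-add, so the residual is not rounded before it is used.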
10334
10335static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10336 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10337 SDNodeFlags Flags) {
10338 if (GlueChain->getNumValues() <= 1) {
10339 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10340 }
10341
10342 assert(GlueChain->getNumValues() == 3);
10343
10344 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10345 switch (Opcode) {
10346 default: llvm_unreachable("no chain equivalent for opcode");
10347 case ISD::FMUL:
10348 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10349 break;
10350 }
10351
10352 return DAG.getNode(Opcode, SL, VTList,
10353 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10354 Flags);
10355}
10356
10357static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10358 EVT VT, SDValue A, SDValue B, SDValue C,
10359 SDValue GlueChain, SDNodeFlags Flags) {
10360 if (GlueChain->getNumValues() <= 1) {
10361 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10362 }
10363
10364 assert(GlueChain->getNumValues() == 3);
10365
10366 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10367 switch (Opcode) {
10368 default: llvm_unreachable("no chain equivalent for opcode");
10369 case ISD::FMA:
10370 Opcode = AMDGPUISD::FMA_W_CHAIN;
10371 break;
10372 }
10373
10374 return DAG.getNode(Opcode, SL, VTList,
10375 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10376 Flags);
10377}
10378
10379SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10380 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10381 return FastLowered;
10382
10383 SDLoc SL(Op);
10384 SDValue Src0 = Op.getOperand(0);
10385 SDValue Src1 = Op.getOperand(1);
10386
10387 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10388 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10389
10390 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10391 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10392
10393 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10394 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10395
10396 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10397}
10398
10399// Faster 2.5 ULP division that does not support denormals.
10400SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10401 SDNodeFlags Flags = Op->getFlags();
10402 SDLoc SL(Op);
10403 SDValue LHS = Op.getOperand(1);
10404 SDValue RHS = Op.getOperand(2);
10405
10406 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10407
10408 const APFloat K0Val(0x1p+96f);
10409 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10410
10411 const APFloat K1Val(0x1p-32f);
10412 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10413
10414 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10415
10416 EVT SetCCVT =
10417 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10418
10419 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10420
10421 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10422
10423 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10424
10425 // rcp does not support denormals.
10426 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10427
10428 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10429
10430 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10431}
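// The select/multiply dance above keeps the reciprocal from overflowing for
// huge denominators: when |RHS| exceeds 2^96 the denominator is pre-scaled by
// 2^-32 before v_rcp_f32, and the same factor is re-applied to the final
// product. Roughly:
//   s = (|rhs| > 2^96) ? 2^-32 : 1.0
//   lhs / rhs ~= s * (lhs * rcp(rhs * s))
// which is an identity, since s * rcp(rhs * s) == rcp(rhs) in exact math.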
10432
10433// Returns immediate value for setting the F32 denorm mode when using the
10434// S_DENORM_MODE instruction.
10435 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10436                                     const SIMachineFunctionInfo *Info,
10437 const GCNSubtarget *ST) {
10438 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10439 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10440 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10441 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10442}
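// The 4-bit immediate consumed by S_DENORM_MODE packs two 2-bit fields:
// bits [1:0] select the single-precision denormal mode and bits [3:2] the
// double/half-precision mode, hence the (DP default << 2) above. Only the SP
// field is overridden by the caller; the DP/f16 field keeps the function's
// default mode.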
10443
10444SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10445 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10446 return FastLowered;
10447
10448   // The selection matcher assumes anything with a chain selects to a
10449 // mayRaiseFPException machine instruction. Since we're introducing a chain
10450 // here, we need to explicitly report nofpexcept for the regular fdiv
10451 // lowering.
10452 SDNodeFlags Flags = Op->getFlags();
10453 Flags.setNoFPExcept(true);
10454
10455 SDLoc SL(Op);
10456 SDValue LHS = Op.getOperand(0);
10457 SDValue RHS = Op.getOperand(1);
10458
10459 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10460
10461 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10462
10463 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10464 {RHS, RHS, LHS}, Flags);
10465 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10466 {LHS, RHS, LHS}, Flags);
10467
10468 // Denominator is scaled to not be denormal, so using rcp is ok.
10469 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10470 DenominatorScaled, Flags);
10471 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10472 DenominatorScaled, Flags);
10473
10474 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
10475                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
10476                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
10477   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10478
10479 const MachineFunction &MF = DAG.getMachineFunction();
10480   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10481   const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10482
10483 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10484 const bool HasDynamicDenormals =
10485 (DenormMode.Input == DenormalMode::Dynamic) ||
10486 (DenormMode.Output == DenormalMode::Dynamic);
10487
10488 SDValue SavedDenormMode;
10489
10490 if (!PreservesDenormals) {
10491 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10492 // lowering. The chain dependence is insufficient, and we need glue. We do
10493 // not need the glue variants in a strictfp function.
10494
10495 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10496
10497 SDValue Glue = DAG.getEntryNode();
10498 if (HasDynamicDenormals) {
10499 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10500 DAG.getVTList(MVT::i32, MVT::Glue),
10501 {BitField, Glue});
10502 SavedDenormMode = SDValue(GetReg, 0);
10503
10504 Glue = DAG.getMergeValues(
10505 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10506 }
10507
10508 SDNode *EnableDenorm;
10509 if (Subtarget->hasDenormModeInst()) {
10510 const SDValue EnableDenormValue =
10511 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10512
10513 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10514 EnableDenormValue)
10515 .getNode();
10516 } else {
10517 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10518 SL, MVT::i32);
10519 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10520 {EnableDenormValue, BitField, Glue});
10521 }
10522
10523 SDValue Ops[3] = {
10524 NegDivScale0,
10525 SDValue(EnableDenorm, 0),
10526 SDValue(EnableDenorm, 1)
10527 };
10528
10529 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10530 }
10531
10532 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10533 ApproxRcp, One, NegDivScale0, Flags);
10534
10535 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10536 ApproxRcp, Fma0, Flags);
10537
10538 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10539 Fma1, Fma1, Flags);
10540
10541 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10542 NumeratorScaled, Mul, Flags);
10543
10544 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10545 Fma2, Fma1, Mul, Fma2, Flags);
10546
10547 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10548 NumeratorScaled, Fma3, Flags);
10549
10550 if (!PreservesDenormals) {
10551 SDNode *DisableDenorm;
10552 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10553 const SDValue DisableDenormValue = getSPDenormModeValue(
10554 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10555
10556 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10557 Fma4.getValue(1), DisableDenormValue,
10558 Fma4.getValue(2)).getNode();
10559 } else {
10560 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10561 const SDValue DisableDenormValue =
10562 HasDynamicDenormals
10563 ? SavedDenormMode
10564 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10565
10566 DisableDenorm = DAG.getMachineNode(
10567 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10568 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10569 }
10570
10571 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10572 SDValue(DisableDenorm, 0), DAG.getRoot());
10573 DAG.setRoot(OutputChain);
10574 }
10575
10576 SDValue Scale = NumeratorScaled.getValue(1);
10577 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10578 {Fma4, Fma1, Fma3, Scale}, Flags);
10579
10580 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10581}
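// Taken together this is the standard AMDGPU f32 division expansion:
// div_scale moves the operands away from the denormal/overflow range, rcp
// provides an initial reciprocal estimate, the FMA chain refines first the
// reciprocal and then the scaled quotient, div_fmas applies the last
// correction under the scale predicate, and div_fixup restores signs and
// patches up the special cases (zeros, infinities, NaNs) using the original
// operands. The mode-register save/restore around the FMA chain only exists
// so the refinement runs with denormals enabled even when the function itself
// flushes them.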
10582
10583SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10584 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10585 return FastLowered;
10586
10587 SDLoc SL(Op);
10588 SDValue X = Op.getOperand(0);
10589 SDValue Y = Op.getOperand(1);
10590
10591 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10592
10593 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10594
10595 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10596
10597 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10598
10599 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10600
10601 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10602
10603 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10604
10605 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10606
10607 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10608
10609 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10610 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10611
10612 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10613 NegDivScale0, Mul, DivScale1);
10614
10615 SDValue Scale;
10616
10617 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10618 // Workaround a hardware bug on SI where the condition output from div_scale
10619 // is not usable.
10620
10621 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10622
10623     // Figure out which scale to use for div_fmas.
10624 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10625 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10626 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10627 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10628
10629 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10630 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10631
10632 SDValue Scale0Hi
10633 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10634 SDValue Scale1Hi
10635 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10636
10637 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10638 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10639 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10640 } else {
10641 Scale = DivScale1.getValue(1);
10642 }
10643
10644 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10645 Fma4, Fma3, Mul, Scale);
10646
10647 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10648}
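// When the div_scale condition output is unusable (SI), the predicate that
// div_fmas needs is rebuilt by hand: the high dword of each div_scale result
// is compared for equality with the high dword of the original numerator or
// denominator, and the two compares are XORed. This stands in for the
// "operand was scaled" bit that later subtargets read directly from
// DivScale1.getValue(1).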
10649
10650SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10651 EVT VT = Op.getValueType();
10652
10653 if (VT == MVT::f32)
10654 return LowerFDIV32(Op, DAG);
10655
10656 if (VT == MVT::f64)
10657 return LowerFDIV64(Op, DAG);
10658
10659 if (VT == MVT::f16)
10660 return LowerFDIV16(Op, DAG);
10661
10662 llvm_unreachable("Unexpected type for fdiv");
10663}
10664
10665SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10666 SDLoc dl(Op);
10667 SDValue Val = Op.getOperand(0);
10668 EVT VT = Val.getValueType();
10669 EVT ResultExpVT = Op->getValueType(1);
10670 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10671
10672 SDValue Mant = DAG.getNode(
10673       ISD::INTRINSIC_WO_CHAIN, dl, VT,
10674       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10675
10676 SDValue Exp = DAG.getNode(
10677 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10678 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10679
10680 if (Subtarget->hasFractBug()) {
10681 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10682     SDValue Inf = DAG.getConstantFP(
10683         APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10684
10685 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10686 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10687 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10688 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10689 }
10690
10691 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10692 return DAG.getMergeValues({Mant, CastExp}, dl);
10693}
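// frexp splits Val into a mantissa in +/-[0.5, 1.0) and an integer exponent.
// On subtargets with the fract bug, the underlying frexp_mant/frexp_exp
// instructions are not reliable for +/-inf and NaN, so the setcc/select pair
// above forces the generic behaviour for non-finite inputs: the original
// value is returned as the mantissa and the exponent is zero.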
10694
10695SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10696 SDLoc DL(Op);
10697 StoreSDNode *Store = cast<StoreSDNode>(Op);
10698 EVT VT = Store->getMemoryVT();
10699
10700 if (VT == MVT::i1) {
10701 return DAG.getTruncStore(Store->getChain(), DL,
10702 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10703 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10704 }
10705
10706 assert(VT.isVector() &&
10707 Store->getValue().getValueType().getScalarType() == MVT::i32);
10708
10709 unsigned AS = Store->getAddressSpace();
10710 if (Subtarget->hasLDSMisalignedBug() &&
10711 AS == AMDGPUAS::FLAT_ADDRESS &&
10712 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10713 return SplitVectorStore(Op, DAG);
10714 }
10715
10716   MachineFunction &MF = DAG.getMachineFunction();
10717   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10718   // If there is a possibility that a flat instruction accesses scratch memory,
10719   // then we need to use the same legalization rules we use for private.
10720   if (AS == AMDGPUAS::FLAT_ADDRESS &&
10721       !Subtarget->hasMultiDwordFlatScratchAddressing())
10722     AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10723          AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10724
10725 unsigned NumElements = VT.getVectorNumElements();
10726 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10727 AS == AMDGPUAS::FLAT_ADDRESS) {
10728 if (NumElements > 4)
10729 return SplitVectorStore(Op, DAG);
10730 // v3 stores not supported on SI.
10731 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10732 return SplitVectorStore(Op, DAG);
10733
10734     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10735                                         VT, *Store->getMemOperand()))
10736 return expandUnalignedStore(Store, DAG);
10737
10738 return SDValue();
10739 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10740 switch (Subtarget->getMaxPrivateElementSize()) {
10741 case 4:
10742 return scalarizeVectorStore(Store, DAG);
10743 case 8:
10744 if (NumElements > 2)
10745 return SplitVectorStore(Op, DAG);
10746 return SDValue();
10747 case 16:
10748 if (NumElements > 4 ||
10749 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10750 return SplitVectorStore(Op, DAG);
10751 return SDValue();
10752 default:
10753 llvm_unreachable("unsupported private_element_size");
10754 }
10755 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10756 unsigned Fast = 0;
10757 auto Flags = Store->getMemOperand()->getFlags();
10758     if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10759                                            Store->getAlign(), Flags, &Fast) &&
10760 Fast > 1)
10761 return SDValue();
10762
10763 if (VT.isVector())
10764 return SplitVectorStore(Op, DAG);
10765
10766 return expandUnalignedStore(Store, DAG);
10767 }
10768
10769 // Probably an invalid store. If so we'll end up emitting a selection error.
10770 return SDValue();
10771}
10772
10773// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10774SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10775 SDLoc SL(Op);
10776 assert(!Subtarget->has16BitInsts());
10777 SDNodeFlags Flags = Op->getFlags();
10778 SDValue Ext =
10779 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10780
10781 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10782 SDValue Sqrt =
10783 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10784
10785 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10786 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10787}
10788
10789SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10790 SDLoc DL(Op);
10791 SDNodeFlags Flags = Op->getFlags();
10792 MVT VT = Op.getValueType().getSimpleVT();
10793 const SDValue X = Op.getOperand(0);
10794
10795 if (allowApproxFunc(DAG, Flags)) {
10796 // Instruction is 1ulp but ignores denormals.
10797 return DAG.getNode(
10798         ISD::INTRINSIC_WO_CHAIN, DL, VT,
10799         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10800 }
10801
10802 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10803 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10804
10805 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10806
10807 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10808
10809 SDValue SqrtX =
10810 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
10811
10812 SDValue SqrtS;
10813 if (needsDenormHandlingF32(DAG, X, Flags)) {
10814 SDValue SqrtID =
10815 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
10816 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
10817
10818 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
10819 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10820 DAG.getConstant(-1, DL, MVT::i32));
10821 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
10822
10823 SDValue NegSqrtSNextDown =
10824 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
10825
10826 SDValue SqrtVP =
10827 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
10828
10829 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10830 DAG.getConstant(1, DL, MVT::i32));
10831 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
10832
10833 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
10834 SDValue SqrtVS =
10835 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
10836
10837 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
10838 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
10839
10840 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
10841 Flags);
10842
10843 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
10844 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
10845 Flags);
10846 } else {
10847 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
10848
10849 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
10850
10851 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
10852 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
10853 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
10854
10855 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
10856 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
10857 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
10858
10859 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
10860 SDValue SqrtD =
10861 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
10862 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
10863 }
10864
10865 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
10866
10867 SDValue ScaledDown =
10868 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
10869
10870 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
10871 SDValue IsZeroOrInf =
10872 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
10873 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
10874
10875 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
10876}
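// The scaling above keeps the intermediate square root out of the denormal
// range: inputs below 2^-96 are multiplied by 2^32 before the computation and
// the result is multiplied by 2^-16 afterwards, using the identity
// sqrt(x * 2^32) == sqrt(x) * 2^16. The final IS_FPCLASS select passes +/-0
// and +inf through unchanged, since those inputs are already their own square
// roots and the refinement steps are not guaranteed to preserve them.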
10877
10878SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
10879   // For double type, the SQRT and RSQ instructions don't have the required
10880   // precision, so we apply Goldschmidt's algorithm to improve the result:
10881 //
10882 // y0 = rsq(x)
10883 // g0 = x * y0
10884 // h0 = 0.5 * y0
10885 //
10886 // r0 = 0.5 - h0 * g0
10887 // g1 = g0 * r0 + g0
10888 // h1 = h0 * r0 + h0
10889 //
10890 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
10891 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
10892 // h2 = h1 * r1 + h1
10893 //
10894 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
10895 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
10896 //
10897 // sqrt(x) = g3
10898
10899 SDNodeFlags Flags = Op->getFlags();
10900
10901 SDLoc DL(Op);
10902
10903 SDValue X = Op.getOperand(0);
10904 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
10905
10906 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
10907
10908 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
10909
10910 // Scale up input if it is too small.
10911 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
10912 SDValue ScaleUp =
10913 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
10914 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
10915
10916 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
10917
10918 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
10919
10920 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
10921 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
10922
10923 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
10924 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
10925
10926 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
10927
10928 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
10929
10930 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
10931 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
10932
10933 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
10934
10935 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
10936 SDValue SqrtD1 =
10937 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
10938
10939 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
10940
10941 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
10942 SDValue ScaleDown =
10943 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
10944 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
10945
10946 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
10947 // with finite only or nsz because rsq(+/-0) = +/-inf
10948
10949 // TODO: Check for DAZ and expand to subnormals
10950 SDValue IsZeroOrInf =
10951 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
10952 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
10953
10954 // If x is +INF, +0, or -0, use its original value
10955 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
10956 Flags);
10957}
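// The f64 path uses the same rescaling idea, but through the exponent:
// inputs below 2^-767 are scaled by 2^+256 with ldexp before the Goldschmidt
// iteration and the result is scaled back by 2^-128 afterwards, relying on
// sqrt(x * 2^256) == sqrt(x) * 2^128. As in the f32 case, zero and +inf
// inputs bypass the iteration through the IS_FPCLASS select.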
10958
10959SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
10960 SDLoc DL(Op);
10961 EVT VT = Op.getValueType();
10962 SDValue Arg = Op.getOperand(0);
10963 SDValue TrigVal;
10964
10965 // Propagate fast-math flags so that the multiply we introduce can be folded
10966 // if Arg is already the result of a multiply by constant.
10967 auto Flags = Op->getFlags();
10968
10969 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
10970
10971 if (Subtarget->hasTrigReducedRange()) {
10972 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
10973 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
10974 } else {
10975 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
10976 }
10977
10978 switch (Op.getOpcode()) {
10979 case ISD::FCOS:
10980 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
10981 case ISD::FSIN:
10982 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
10983 default:
10984 llvm_unreachable("Wrong trig opcode");
10985 }
10986}
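// The hardware SIN/COS instructions take their operand in units of full turns
// rather than radians, which is why the argument is first multiplied by
// 1/(2*pi) (written as 0.5 * inv_pi above). Subtargets with a reduced trig
// input range additionally need the explicit FRACT so the scaled argument is
// wrapped into [0, 1) before it reaches the instruction.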
10987
10988SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
10989 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
10990 assert(AtomicNode->isCompareAndSwap());
10991 unsigned AS = AtomicNode->getAddressSpace();
10992
10993 // No custom lowering required for local address space
10994   if (!AMDGPU::isFlatGlobalAddrSpace(AS))
10995     return Op;
10996
10997 // Non-local address space requires custom lowering for atomic compare
10998 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
10999 SDLoc DL(Op);
11000 SDValue ChainIn = Op.getOperand(0);
11001 SDValue Addr = Op.getOperand(1);
11002 SDValue Old = Op.getOperand(2);
11003 SDValue New = Op.getOperand(3);
11004 EVT VT = Op.getValueType();
11005 MVT SimpleVT = VT.getSimpleVT();
11006 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11007
11008 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11009 SDValue Ops[] = { ChainIn, Addr, NewOld };
11010
11011 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11012 Ops, VT, AtomicNode->getMemOperand());
11013}
11014
11015//===----------------------------------------------------------------------===//
11016// Custom DAG optimizations
11017//===----------------------------------------------------------------------===//
11018
11019SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11020 DAGCombinerInfo &DCI) const {
11021 EVT VT = N->getValueType(0);
11022 EVT ScalarVT = VT.getScalarType();
11023 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11024 return SDValue();
11025
11026 SelectionDAG &DAG = DCI.DAG;
11027 SDLoc DL(N);
11028
11029 SDValue Src = N->getOperand(0);
11030 EVT SrcVT = Src.getValueType();
11031
11032 // TODO: We could try to match extracting the higher bytes, which would be
11033 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11034 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11035 // about in practice.
11036 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11037 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11038 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11039 DCI.AddToWorklist(Cvt.getNode());
11040
11041 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11042 if (ScalarVT != MVT::f32) {
11043 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11044 DAG.getTargetConstant(0, DL, MVT::i32));
11045 }
11046 return Cvt;
11047 }
11048 }
11049
11050 return SDValue();
11051}
11052
11053SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11054 DAGCombinerInfo &DCI) const {
11055 SDValue MagnitudeOp = N->getOperand(0);
11056 SDValue SignOp = N->getOperand(1);
11057 SelectionDAG &DAG = DCI.DAG;
11058 SDLoc DL(N);
11059
11060 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11061 // lower half with a copy.
11062 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11063 if (MagnitudeOp.getValueType() == MVT::f64) {
11064 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11065 SDValue MagLo =
11066 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11067 DAG.getConstant(0, DL, MVT::i32));
11068 SDValue MagHi =
11069 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11070 DAG.getConstant(1, DL, MVT::i32));
11071
11072 SDValue HiOp =
11073 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11074
11075 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11076
11077 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11078 }
11079
11080 if (SignOp.getValueType() != MVT::f64)
11081 return SDValue();
11082
11083 // Reduce width of sign operand, we only need the highest bit.
11084 //
11085 // fcopysign f64:x, f64:y ->
11086 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11087 // TODO: In some cases it might make sense to go all the way to f16.
11088 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11089 SDValue SignAsF32 =
11090 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11091 DAG.getConstant(1, DL, MVT::i32));
11092
11093 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11094 SignAsF32);
11095}
11096
11097// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11098// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11099// bits
11100
11101// This is a variant of
11102// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11103//
11104// The normal DAG combiner will do this, but only if the add has one use since
11105// that would increase the number of instructions.
11106//
11107// This prevents us from seeing a constant offset that can be folded into a
11108// memory instruction's addressing mode. If we know the resulting add offset of
11109// a pointer can be folded into an addressing offset, we can replace the pointer
11110// operand with the add of new constant offset. This eliminates one of the uses,
11111// and may allow the remaining use to also be simplified.
11112//
11113SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11114 unsigned AddrSpace,
11115 EVT MemVT,
11116 DAGCombinerInfo &DCI) const {
11117 SDValue N0 = N->getOperand(0);
11118 SDValue N1 = N->getOperand(1);
11119
11120 // We only do this to handle cases where it's profitable when there are
11121 // multiple uses of the add, so defer to the standard combine.
11122 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11123 N0->hasOneUse())
11124 return SDValue();
11125
11126 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11127 if (!CN1)
11128 return SDValue();
11129
11130 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11131 if (!CAdd)
11132 return SDValue();
11133
11134 SelectionDAG &DAG = DCI.DAG;
11135
11136 if (N0->getOpcode() == ISD::OR &&
11137 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11138 return SDValue();
11139
11140 // If the resulting offset is too large, we can't fold it into the
11141 // addressing mode offset.
11142 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11143 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11144
11145 AddrMode AM;
11146 AM.HasBaseReg = true;
11147 AM.BaseOffs = Offset.getSExtValue();
11148 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11149 return SDValue();
11150
11151 SDLoc SL(N);
11152 EVT VT = N->getValueType(0);
11153
11154 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11155 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11156
11157   SDNodeFlags Flags;
11158   Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11159 (N0.getOpcode() == ISD::OR ||
11160 N0->getFlags().hasNoUnsignedWrap()));
11161
11162 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11163}
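// A concrete (hypothetical) shape this catches: a pointer computed as
//   (shl (add %x, 16), 2)
// where the add has several other users. Reassociating it into
//   (add (shl %x, 2), 64)
// exposes the constant 64, which isLegalAddressingMode() has just confirmed
// fits the addressing mode, so each memory access can fold its own offset
// while the shifted base stays shared.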
11164
11165 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to offset
11166/// by the chain and intrinsic ID. Theoretically we would also need to check the
11167/// specific intrinsic, but they all place the pointer operand first.
11168static unsigned getBasePtrIndex(const MemSDNode *N) {
11169 switch (N->getOpcode()) {
11170 case ISD::STORE:
11171   case ISD::INTRINSIC_W_CHAIN:
11172   case ISD::INTRINSIC_VOID:
11173     return 2;
11174 default:
11175 return 1;
11176 }
11177}
11178
11179SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11180 DAGCombinerInfo &DCI) const {
11181 SelectionDAG &DAG = DCI.DAG;
11182 SDLoc SL(N);
11183
11184 unsigned PtrIdx = getBasePtrIndex(N);
11185 SDValue Ptr = N->getOperand(PtrIdx);
11186
11187 // TODO: We could also do this for multiplies.
11188 if (Ptr.getOpcode() == ISD::SHL) {
11189 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11190 N->getMemoryVT(), DCI);
11191 if (NewPtr) {
11192 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11193
11194 NewOps[PtrIdx] = NewPtr;
11195 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11196 }
11197 }
11198
11199 return SDValue();
11200}
11201
11202static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11203 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11204 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11205 (Opc == ISD::XOR && Val == 0);
11206}
11207
11208 // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor. This
11209// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11210// integer combine opportunities since most 64-bit operations are decomposed
11211// this way. TODO: We won't want this for SALU especially if it is an inline
11212// immediate.
11213SDValue SITargetLowering::splitBinaryBitConstantOp(
11214 DAGCombinerInfo &DCI,
11215 const SDLoc &SL,
11216 unsigned Opc, SDValue LHS,
11217 const ConstantSDNode *CRHS) const {
11218 uint64_t Val = CRHS->getZExtValue();
11219 uint32_t ValLo = Lo_32(Val);
11220 uint32_t ValHi = Hi_32(Val);
11221   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11222
11223 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11224 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11225 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11226 // If we need to materialize a 64-bit immediate, it will be split up later
11227 // anyway. Avoid creating the harder to understand 64-bit immediate
11228 // materialization.
11229 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11230 }
11231
11232 return SDValue();
11233}
11234
11235 static bool isBoolSGPR(SDValue V) {
11236   if (V.getValueType() != MVT::i1)
11237 return false;
11238 switch (V.getOpcode()) {
11239 default:
11240 break;
11241 case ISD::SETCC:
11242   case AMDGPUISD::FP_CLASS:
11243     return true;
11244 case ISD::AND:
11245 case ISD::OR:
11246 case ISD::XOR:
11247 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11248 }
11249 return false;
11250}
11251
11252// If a constant has all zeroes or all ones within each byte return it.
11253// Otherwise return 0.
11254 static uint32_t getConstantPermuteMask(uint32_t C) {
11255   // 0xff for any zero byte in the mask
11256 uint32_t ZeroByteMask = 0;
11257 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11258 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11259 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11260 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11261 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11262 if ((NonZeroByteMask & C) != NonZeroByteMask)
11263 return 0; // Partial bytes selected.
11264 return C;
11265}
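// For example, 0x00ff00ff is returned unchanged because every byte is either
// 0x00 or 0xff, while 0x00ff000f returns 0 because the low byte is only
// partially selected.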
11266
11267// Check if a node selects whole bytes from its operand 0 starting at a byte
11268 // boundary while masking the rest. Returns a select mask as used by the
11269 // v_perm_b32 instruction, or ~0 if it does not succeed.
11270// Note byte select encoding:
11271// value 0-3 selects corresponding source byte;
11272// value 0xc selects zero;
11273// value 0xff selects 0xff.
11274 static uint32_t getPermuteMask(SDValue V) {
11275   assert(V.getValueSizeInBits() == 32);
11276
11277 if (V.getNumOperands() != 2)
11278 return ~0;
11279
11280 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11281 if (!N1)
11282 return ~0;
11283
11284 uint32_t C = N1->getZExtValue();
11285
11286 switch (V.getOpcode()) {
11287 default:
11288 break;
11289 case ISD::AND:
11290 if (uint32_t ConstMask = getConstantPermuteMask(C))
11291 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11292 break;
11293
11294 case ISD::OR:
11295 if (uint32_t ConstMask = getConstantPermuteMask(C))
11296 return (0x03020100 & ~ConstMask) | ConstMask;
11297 break;
11298
11299 case ISD::SHL:
11300 if (C % 8)
11301 return ~0;
11302
11303 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11304
11305 case ISD::SRL:
11306 if (C % 8)
11307 return ~0;
11308
11309 return uint32_t(0x0c0c0c0c03020100ull >> C);
11310 }
11311
11312 return ~0;
11313}
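// Example masks produced here (byte 0 is the least significant byte and the
// selector 0x0c means "zero"):
//   (shl x, 16) -> 0x01000c0c : dest bytes 3,2 take src bytes 1,0 and dest
//                               bytes 1,0 are zero.
//   (srl x, 16) -> 0x0c0c0302 : dest bytes 1,0 take src bytes 3,2 and dest
//                               bytes 3,2 are zero.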
11314
11315SDValue SITargetLowering::performAndCombine(SDNode *N,
11316 DAGCombinerInfo &DCI) const {
11317 if (DCI.isBeforeLegalize())
11318 return SDValue();
11319
11320 SelectionDAG &DAG = DCI.DAG;
11321 EVT VT = N->getValueType(0);
11322 SDValue LHS = N->getOperand(0);
11323 SDValue RHS = N->getOperand(1);
11324
11325
11326 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11327 if (VT == MVT::i64 && CRHS) {
11328 if (SDValue Split
11329 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11330 return Split;
11331 }
11332
11333 if (CRHS && VT == MVT::i32) {
11334 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11335 // nb = number of trailing zeroes in mask
11336 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11337 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11338 uint64_t Mask = CRHS->getZExtValue();
11339 unsigned Bits = llvm::popcount(Mask);
11340 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11341 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11342 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11343 unsigned Shift = CShift->getZExtValue();
11344 unsigned NB = CRHS->getAPIntValue().countr_zero();
11345 unsigned Offset = NB + Shift;
11346 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11347 SDLoc SL(N);
11348 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11349 LHS->getOperand(0),
11350 DAG.getConstant(Offset, SL, MVT::i32),
11351 DAG.getConstant(Bits, SL, MVT::i32));
11352 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11353 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11354 DAG.getValueType(NarrowVT));
11355 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11356 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11357 return Shl;
11358 }
11359 }
11360 }
11361
11362 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11363 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11364 isa<ConstantSDNode>(LHS.getOperand(2))) {
11365 uint32_t Sel = getConstantPermuteMask(Mask);
11366 if (!Sel)
11367 return SDValue();
11368
11369 // Select 0xc for all zero bytes
11370 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11371 SDLoc DL(N);
11372 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11373 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11374 }
11375 }
11376
11377 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11378 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11379 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11380 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11381 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11382
11383 SDValue X = LHS.getOperand(0);
11384 SDValue Y = RHS.getOperand(0);
11385 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11386 !isTypeLegal(X.getValueType()))
11387 return SDValue();
11388
11389 if (LCC == ISD::SETO) {
11390 if (X != LHS.getOperand(1))
11391 return SDValue();
11392
11393 if (RCC == ISD::SETUNE) {
11394 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11395 if (!C1 || !C1->isInfinity() || C1->isNegative())
11396 return SDValue();
11397
11398         const unsigned Mask = SIInstrFlags::N_NORMAL |
11399                               SIInstrFlags::N_SUBNORMAL |
11400                               SIInstrFlags::N_ZERO |
11401                               SIInstrFlags::P_ZERO |
11402                               SIInstrFlags::P_SUBNORMAL |
11403                               SIInstrFlags::P_NORMAL;
11404
11405         static_assert(((~(SIInstrFlags::S_NAN |
11406                           SIInstrFlags::Q_NAN |
11407                           SIInstrFlags::N_INFINITY |
11408                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11409 "mask not equal");
11410
11411 SDLoc DL(N);
11412 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11413 X, DAG.getConstant(Mask, DL, MVT::i32));
11414 }
11415 }
11416 }
11417
11418 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11419 std::swap(LHS, RHS);
11420
11421 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11422 RHS.hasOneUse()) {
11423 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11424 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11425 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11426 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11427 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11428 (RHS.getOperand(0) == LHS.getOperand(0) &&
11429 LHS.getOperand(0) == LHS.getOperand(1))) {
11430 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11431 unsigned NewMask = LCC == ISD::SETO ?
11432 Mask->getZExtValue() & ~OrdMask :
11433 Mask->getZExtValue() & OrdMask;
11434
11435 SDLoc DL(N);
11436 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11437 DAG.getConstant(NewMask, DL, MVT::i32));
11438 }
11439 }
11440
11441 if (VT == MVT::i32 &&
11442 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11443 // and x, (sext cc from i1) => select cc, x, 0
11444 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11445 std::swap(LHS, RHS);
11446 if (isBoolSGPR(RHS.getOperand(0)))
11447 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11448 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11449 }
11450
11451 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11452   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11453   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11454 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11455 uint32_t LHSMask = getPermuteMask(LHS);
11456 uint32_t RHSMask = getPermuteMask(RHS);
11457 if (LHSMask != ~0u && RHSMask != ~0u) {
11458 // Canonicalize the expression in an attempt to have fewer unique masks
11459 // and therefore fewer registers used to hold the masks.
11460 if (LHSMask > RHSMask) {
11461 std::swap(LHSMask, RHSMask);
11462 std::swap(LHS, RHS);
11463 }
11464
11465       // Select 0xc for each lane used from a source operand. A zero byte has the
11466       // 0xc mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11467 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11468 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11469
11470       // Check if we need to combine values from two sources within a byte.
11471 if (!(LHSUsedLanes & RHSUsedLanes) &&
11472           // If we select the high and low words, keep it for SDWA.
11473 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11474 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11475 // Each byte in each mask is either selector mask 0-3, or has higher
11476 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11477 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11478 // mask which is not 0xff wins. By anding both masks we have a correct
11479 // result except that 0x0c shall be corrected to give 0x0c only.
11480 uint32_t Mask = LHSMask & RHSMask;
11481 for (unsigned I = 0; I < 32; I += 8) {
11482 uint32_t ByteSel = 0xff << I;
11483 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11484 Mask &= (0x0c << I) & 0xffffffff;
11485 }
11486
11487 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11488 // or 0x0c.
11489 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11490 SDLoc DL(N);
11491
11492 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11493 LHS.getOperand(0), RHS.getOperand(0),
11494 DAG.getConstant(Sel, DL, MVT::i32));
11495 }
11496 }
11497 }
11498
11499 return SDValue();
11500}
11501
11502// A key component of v_perm is a mapping between byte position of the src
11503// operands, and the byte position of the dest. To provide such, we need: 1. the
11504// node that provides x byte of the dest of the OR, and 2. the byte of the node
11505// used to provide that x byte. calculateByteProvider finds which node provides
11506// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11507 // and finds an ultimate src and byte position. For example, the supported
11508// LoadCombine pattern for vector loads is as follows
11509// t1
11510// or
11511// / \
11512// t2 t3
11513// zext shl
11514// | | \
11515// t4 t5 16
11516// or anyext
11517// / \ |
11518// t6 t7 t8
11519// srl shl or
11520// / | / \ / \
11521// t9 t10 t11 t12 t13 t14
11522// trunc* 8 trunc* 8 and and
11523// | | / | | \
11524// t15 t16 t17 t18 t19 t20
11525// trunc* 255 srl -256
11526// | / \
11527// t15 t15 16
11528//
11529// *In this example, the truncs are from i32->i16
11530//
11531// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11532// respectively. calculateSrcByte would find (given node) -> ultimate src &
11533// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11534// After finding the mapping, we can combine the tree into vperm t15, t16,
11535// 0x05000407
11536
11537// Find the source and byte position from a node.
11538// \p DestByte is the byte position of the dest of the or that the src
11539// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11540// dest of the or byte. \p Depth tracks how many recursive iterations we have
11541// performed.
11542static const std::optional<ByteProvider<SDValue>>
11543calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11544 unsigned Depth = 0) {
11545 // We may need to recursively traverse a series of SRLs
11546 if (Depth >= 6)
11547 return std::nullopt;
11548
11549 auto ValueSize = Op.getValueSizeInBits();
11550 if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
11551 return std::nullopt;
11552
11553 switch (Op->getOpcode()) {
11554 case ISD::TRUNCATE: {
11555 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11556 }
11557
11558 case ISD::SIGN_EXTEND:
11559 case ISD::ZERO_EXTEND:
11560   case ISD::SIGN_EXTEND_INREG: {
11561     SDValue NarrowOp = Op->getOperand(0);
11562 auto NarrowVT = NarrowOp.getValueType();
11563 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11564 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11565 NarrowVT = VTSign->getVT();
11566 }
11567 if (!NarrowVT.isByteSized())
11568 return std::nullopt;
11569 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11570
11571 if (SrcIndex >= NarrowByteWidth)
11572 return std::nullopt;
11573 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11574 }
11575
11576 case ISD::SRA:
11577 case ISD::SRL: {
11578 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11579 if (!ShiftOp)
11580 return std::nullopt;
11581
11582 uint64_t BitShift = ShiftOp->getZExtValue();
11583
11584 if (BitShift % 8 != 0)
11585 return std::nullopt;
11586
11587 SrcIndex += BitShift / 8;
11588
11589 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11590 }
11591
11592 default: {
11593 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11594 }
11595 }
11596 llvm_unreachable("fully handled switch");
11597}
11598
11599// For a byte position in the result of an Or, traverse the tree and find the
11600// node (and the byte of the node) which ultimately provides this {Or,
11601// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11602// the byte position of the Op that corresponds with the originally requested
11603// byte of the Or \p Depth tracks how many recursive iterations we have
11604// performed. \p StartingIndex is the originally requested byte of the Or
11605static const std::optional<ByteProvider<SDValue>>
11606calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11607 unsigned StartingIndex = 0) {
11608 // Finding Src tree of RHS of or typically requires at least 1 additional
11609 // depth
11610 if (Depth > 6)
11611 return std::nullopt;
11612
11613 unsigned BitWidth = Op.getScalarValueSizeInBits();
11614 if (BitWidth % 8 != 0)
11615 return std::nullopt;
11616 if (Index > BitWidth / 8 - 1)
11617 return std::nullopt;
11618
11619 switch (Op.getOpcode()) {
11620 case ISD::OR: {
11621 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11622 StartingIndex);
11623 if (!RHS)
11624 return std::nullopt;
11625 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11626 StartingIndex);
11627 if (!LHS)
11628 return std::nullopt;
11629 // A well formed Or will have two ByteProviders for each byte, one of which
11630 // is constant zero
11631 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11632 return std::nullopt;
11633 if (!LHS || LHS->isConstantZero())
11634 return RHS;
11635 if (!RHS || RHS->isConstantZero())
11636 return LHS;
11637 return std::nullopt;
11638 }
11639
11640 case ISD::AND: {
11641 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11642 if (!BitMaskOp)
11643 return std::nullopt;
11644
11645 uint32_t BitMask = BitMaskOp->getZExtValue();
11646 // Bits we expect for our StartingIndex
11647 uint32_t IndexMask = 0xFF << (Index * 8);
11648
11649 if ((IndexMask & BitMask) != IndexMask) {
11650 // If the result of the and partially provides the byte, then it
11651 // is not well formatted
11652 if (IndexMask & BitMask)
11653 return std::nullopt;
11654       return ByteProvider<SDValue>::getConstantZero();
11655     }
11656
11657 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11658 }
11659
11660 case ISD::FSHR: {
11661 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11662 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11663 if (!ShiftOp || Op.getValueType().isVector())
11664 return std::nullopt;
11665
11666 uint64_t BitsProvided = Op.getValueSizeInBits();
11667 if (BitsProvided % 8 != 0)
11668 return std::nullopt;
11669
11670 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11671 if (BitShift % 8)
11672 return std::nullopt;
11673
11674 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11675 uint64_t ByteShift = BitShift / 8;
11676
11677 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11678 uint64_t BytesProvided = BitsProvided / 8;
11679 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11680 NewIndex %= BytesProvided;
11681 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11682 }
11683
11684 case ISD::SRA:
11685 case ISD::SRL: {
11686 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11687 if (!ShiftOp)
11688 return std::nullopt;
11689
11690 uint64_t BitShift = ShiftOp->getZExtValue();
11691 if (BitShift % 8)
11692 return std::nullopt;
11693
11694 auto BitsProvided = Op.getScalarValueSizeInBits();
11695 if (BitsProvided % 8 != 0)
11696 return std::nullopt;
11697
11698 uint64_t BytesProvided = BitsProvided / 8;
11699 uint64_t ByteShift = BitShift / 8;
11700 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11701 // If the byte we are trying to provide (as tracked by index) falls in this
11702 // range, then the SRL provides the byte. The byte of interest of the src of
11703 // the SRL is Index + ByteShift
11704 return BytesProvided - ByteShift > Index
11705 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11706 Index + ByteShift)
11707               : ByteProvider<SDValue>::getConstantZero();
11708  }
11709
11710 case ISD::SHL: {
11711 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11712 if (!ShiftOp)
11713 return std::nullopt;
11714
11715 uint64_t BitShift = ShiftOp->getZExtValue();
11716 if (BitShift % 8 != 0)
11717 return std::nullopt;
11718 uint64_t ByteShift = BitShift / 8;
11719
11720 // If we are shifting by an amount greater than (or equal to)
11721 // the index we are trying to provide, then it provides 0s. If not,
11722    // then these bytes are not definitively 0s, and the corresponding byte
11723 // of interest is Index - ByteShift of the src
11724 return Index < ByteShift
11725               ? ByteProvider<SDValue>::getConstantZero()
11726               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11727 Depth + 1, StartingIndex);
11728 }
11729 case ISD::ANY_EXTEND:
11730 case ISD::SIGN_EXTEND:
11731 case ISD::ZERO_EXTEND:
11732  case ISD::SIGN_EXTEND_INREG:
11733  case ISD::AssertZext:
11734 case ISD::AssertSext: {
11735 SDValue NarrowOp = Op->getOperand(0);
11736 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11737 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11738 Op->getOpcode() == ISD::AssertZext ||
11739 Op->getOpcode() == ISD::AssertSext) {
11740 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11741 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11742 }
11743 if (NarrowBitWidth % 8 != 0)
11744 return std::nullopt;
11745 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11746
11747 if (Index >= NarrowByteWidth)
11748 return Op.getOpcode() == ISD::ZERO_EXTEND
11749 ? std::optional<ByteProvider<SDValue>>(
11750                     ByteProvider<SDValue>::getConstantZero())
11751               : std::nullopt;
11752 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11753 }
11754
11755 case ISD::TRUNCATE: {
11756 uint64_t NarrowByteWidth = BitWidth / 8;
11757
11758 if (NarrowByteWidth >= Index) {
11759 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11760 StartingIndex);
11761 }
11762
11763 return std::nullopt;
11764 }
11765
11766 case ISD::CopyFromReg: {
11767 if (BitWidth / 8 > Index)
11768 return calculateSrcByte(Op, StartingIndex, Index);
11769
11770 return std::nullopt;
11771 }
11772
11773 case ISD::LOAD: {
11774 auto L = cast<LoadSDNode>(Op.getNode());
11775
11776 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11777 if (NarrowBitWidth % 8 != 0)
11778 return std::nullopt;
11779 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11780
11781    // If the width of the load does not reach the byte we are trying to provide
11782 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
11783 // question
11784 if (Index >= NarrowByteWidth) {
11785 return L->getExtensionType() == ISD::ZEXTLOAD
11786 ? std::optional<ByteProvider<SDValue>>(
11787                     ByteProvider<SDValue>::getConstantZero())
11788               : std::nullopt;
11789 }
11790
11791 if (NarrowByteWidth > Index) {
11792 return calculateSrcByte(Op, StartingIndex, Index);
11793 }
11794
11795 return std::nullopt;
11796 }
11797
11798 case ISD::BSWAP:
11799 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
11800 Depth + 1, StartingIndex);
11801
11802  case ISD::EXTRACT_VECTOR_ELT: {
11803    auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11804 if (!IdxOp)
11805 return std::nullopt;
11806 auto VecIdx = IdxOp->getZExtValue();
11807 auto ScalarSize = Op.getScalarValueSizeInBits();
11808 if (ScalarSize != 32) {
11809 if ((VecIdx + 1) * ScalarSize > 32)
11810 return std::nullopt;
11811 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
11812 }
11813
11814 return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
11815 StartingIndex, Index);
11816 }
11817
11818 case AMDGPUISD::PERM: {
11819 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11820 if (!PermMask)
11821 return std::nullopt;
11822
11823 auto IdxMask =
11824 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
11825 if (IdxMask > 0x07 && IdxMask != 0x0c)
11826 return std::nullopt;
11827
11828 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
11829 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
11830
11831 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
11832                           : ByteProvider<SDValue>(
11833                                 ByteProvider<SDValue>::getConstantZero());
11834  }
11835
11836 default: {
11837 return std::nullopt;
11838 }
11839 }
11840
11841 llvm_unreachable("fully handled switch");
11842}
11843
11844// Returns true if the Operand is a scalar that is, or was extended from, a
11845// 16-bit value
11845static bool isExtendedFrom16Bits(SDValue &Operand) {
11846
11847 switch (Operand.getOpcode()) {
11848 case ISD::ANY_EXTEND:
11849 case ISD::SIGN_EXTEND:
11850 case ISD::ZERO_EXTEND: {
11851 auto OpVT = Operand.getOperand(0).getValueType();
11852 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
11853 }
11854 case ISD::LOAD: {
11855 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
11856    auto ExtType = L->getExtensionType();
11857 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
11858 ExtType == ISD::EXTLOAD) {
11859 auto MemVT = L->getMemoryVT();
11860 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
11861 }
11862 return L->getMemoryVT().getSizeInBits() == 16;
11863 }
11864 default:
11865 return false;
11866 }
11867}
11868
11869// Returns true if the mask matches consecutive bytes, and the first byte
11870// begins at an even (16-bit aligned) byte offset from the 0th byte
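// For example, a mask of 0x0504 (bytes 4 and 5, starting at an even offset)
// returns true, while 0x0605 (bytes 5 and 6) returns false.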
11871static bool addresses16Bits(int Mask) {
11872 int Low8 = Mask & 0xff;
11873 int Hi8 = (Mask & 0xff00) >> 8;
11874
11875 assert(Low8 < 8 && Hi8 < 8);
11876 // Are the bytes contiguous in the order of increasing addresses.
11877 bool IsConsecutive = (Hi8 - Low8 == 1);
11878  // Is the first byte at a location that is aligned for 16-bit instructions.
11879 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
11880 // In this case, we still need code to extract the 16 bit operand, so it
11881 // is better to use i8 v_perm
11882 bool Is16Aligned = !(Low8 % 2);
11883
11884 return IsConsecutive && Is16Aligned;
11885}
11886
11887// Do not lower into v_perm if the operands are actually 16 bit
11888// and the selected bits (based on PermMask) correspond with two
11889// easily addressable 16 bit operands.
11890static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
11891                                SDValue &OtherOp) {
11892 int Low16 = PermMask & 0xffff;
11893 int Hi16 = (PermMask & 0xffff0000) >> 16;
11894
11895 assert(Op.getValueType().isByteSized());
11896 assert(OtherOp.getValueType().isByteSized());
11897
11898 auto TempOp = peekThroughBitcasts(Op);
11899 auto TempOtherOp = peekThroughBitcasts(OtherOp);
11900
11901  auto OpIs16Bit =
11902      TempOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
11903 if (!OpIs16Bit)
11904 return true;
11905
11906 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
11907 isExtendedFrom16Bits(TempOtherOp);
11908 if (!OtherOpIs16Bit)
11909 return true;
11910
11911 // Do we cleanly address both
11912 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
11913}
11914
11915static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
11916  SelectionDAG &DAG = DCI.DAG;
11917 EVT VT = N->getValueType(0);
11918
11919 if (VT != MVT::i32)
11920 return SDValue();
11921
11922 // VT is known to be MVT::i32, so we need to provide 4 bytes.
11923  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
11924  for (int i = 0; i < 4; i++) {
11925 // Find the ByteProvider that provides the ith byte of the result of OR
11926 std::optional<ByteProvider<SDValue>> P =
11927 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
11928 // TODO support constantZero
11929 if (!P || P->isConstantZero())
11930 return SDValue();
11931
11932 PermNodes.push_back(*P);
11933 }
11934 if (PermNodes.size() != 4)
11935 return SDValue();
11936
11937 int FirstSrc = 0;
11938 std::optional<int> SecondSrc;
11939 uint64_t PermMask = 0x00000000;
11940 for (size_t i = 0; i < PermNodes.size(); i++) {
11941 auto PermOp = PermNodes[i];
11942 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
11943 // by sizeof(Src2) = 4
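    // For example, byte 0 of the first distinct source is encoded as select
    // value 4 in the v_perm mask, while byte 0 of the second source is 0.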
11944 int SrcByteAdjust = 4;
11945
11946 if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
11947 if (SecondSrc.has_value())
11948 if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
11949 return SDValue();
11950
11951 // Set the index of the second distinct Src node
11952 SecondSrc = i;
11953 assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
11954 SrcByteAdjust = 0;
11955 }
11956 assert(PermOp.SrcOffset + SrcByteAdjust < 8);
11958 PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
11959 }
11960
11961 SDValue Op = *PermNodes[FirstSrc].Src;
11962 SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
11963 : *PermNodes[FirstSrc].Src;
11964
11965 // Check that we haven't just recreated the same FSHR node.
11966 if (N->getOpcode() == ISD::FSHR &&
11967 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
11968 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
11969 return SDValue();
11970
11971 // Check that we are not just extracting the bytes in order from an op
11972 if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
11973 int Low16 = PermMask & 0xffff;
11974 int Hi16 = (PermMask & 0xffff0000) >> 16;
11975
11976 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
11977 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
11978
11979 // The perm op would really just produce Op. So combine into Op
11980 if (WellFormedLow && WellFormedHi)
11981 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
11982 }
11983
11984 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
11985 SDLoc DL(N);
11986 assert(Op.getValueType().isByteSized() &&
11987 OtherOp.getValueType().isByteSized());
11988
11989 // If the ultimate src is less than 32 bits, then we will only be
11990 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
11991 // CalculateByteProvider would not have returned Op as source if we
11992 // used a byte that is outside its ValueType. Thus, we are free to
11993    // ANY_EXTEND as the extended bits are don't-cares.
11994 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
11995 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
11996
11997 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
11998 DAG.getConstant(PermMask, DL, MVT::i32));
11999 }
12000
12001 return SDValue();
12002}
12003
12004SDValue SITargetLowering::performOrCombine(SDNode *N,
12005 DAGCombinerInfo &DCI) const {
12006 SelectionDAG &DAG = DCI.DAG;
12007 SDValue LHS = N->getOperand(0);
12008 SDValue RHS = N->getOperand(1);
12009
12010 EVT VT = N->getValueType(0);
12011 if (VT == MVT::i1) {
12012 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12013 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12014 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12015 SDValue Src = LHS.getOperand(0);
12016 if (Src != RHS.getOperand(0))
12017 return SDValue();
12018
12019 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12020 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12021 if (!CLHS || !CRHS)
12022 return SDValue();
12023
12024 // Only 10 bits are used.
12025 static const uint32_t MaxMask = 0x3ff;
12026
12027 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12028 SDLoc DL(N);
12029 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12030 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12031 }
12032
12033 return SDValue();
12034 }
12035
12036 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12037 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12038 LHS.getOpcode() == AMDGPUISD::PERM &&
12039 isa<ConstantSDNode>(LHS.getOperand(2))) {
12040 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12041 if (!Sel)
12042 return SDValue();
12043
12044 Sel |= LHS.getConstantOperandVal(2);
12045 SDLoc DL(N);
12046 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12047 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12048 }
12049
12050 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12051  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12052  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12053 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12054
12055 // If all the uses of an or need to extract the individual elements, do not
12056 // attempt to lower into v_perm
12057 auto usesCombinedOperand = [](SDNode *OrUse) {
12058 // If we have any non-vectorized use, then it is a candidate for v_perm
12059 if (OrUse->getOpcode() != ISD::BITCAST ||
12060 !OrUse->getValueType(0).isVector())
12061 return true;
12062
12063      // If any use of the vectorized bitcast is itself non-vectorized, then the
12064      // or is still a candidate for v_perm
12064 for (auto VUse : OrUse->uses()) {
12065 if (!VUse->getValueType(0).isVector())
12066 return true;
12067
12068 // If the use of a vector is a store, then combining via a v_perm
12069 // is beneficial.
12070 // TODO -- whitelist more uses
12071 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12072 if (VUse->getOpcode() == VectorwiseOp)
12073 return true;
12074 }
12075 return false;
12076 };
12077
12078 if (!any_of(N->uses(), usesCombinedOperand))
12079 return SDValue();
12080
12081 uint32_t LHSMask = getPermuteMask(LHS);
12082 uint32_t RHSMask = getPermuteMask(RHS);
12083
12084 if (LHSMask != ~0u && RHSMask != ~0u) {
12085 // Canonicalize the expression in an attempt to have fewer unique masks
12086 // and therefore fewer registers used to hold the masks.
12087 if (LHSMask > RHSMask) {
12088 std::swap(LHSMask, RHSMask);
12089 std::swap(LHS, RHS);
12090 }
12091
12092      // Select 0xc for each lane used from the source operand. Zeroed lanes have
12093      // the 0xc mask set, 0xff lanes have 0xff, and actual lanes are in the 0-3 range.
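      // For example, an LHSMask of 0x0c0c0302 uses only the two low lanes, so
      // LHSUsedLanes is 0x00000c0c.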
12094 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12095 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12096
12097      // Check if we need to combine values from two sources within a byte.
12098 if (!(LHSUsedLanes & RHSUsedLanes) &&
12099          // If we select the high and low words, keep it for SDWA.
12100 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12101 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12102 // Kill zero bytes selected by other mask. Zero value is 0xc.
12103 LHSMask &= ~RHSUsedLanes;
12104 RHSMask &= ~LHSUsedLanes;
12105 // Add 4 to each active LHS lane
12106 LHSMask |= LHSUsedLanes & 0x04040404;
12107 // Combine masks
12108 uint32_t Sel = LHSMask | RHSMask;
12109 SDLoc DL(N);
12110
12111 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12112 LHS.getOperand(0), RHS.getOperand(0),
12113 DAG.getConstant(Sel, DL, MVT::i32));
12114 }
12115 }
12116 if (LHSMask == ~0u || RHSMask == ~0u) {
12117 if (SDValue Perm = matchPERM(N, DCI))
12118 return Perm;
12119 }
12120 }
12121
12122 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12123 return SDValue();
12124
12125 // TODO: This could be a generic combine with a predicate for extracting the
12126 // high half of an integer being free.
12127
12128 // (or i64:x, (zero_extend i32:y)) ->
12129 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12130 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12131 RHS.getOpcode() != ISD::ZERO_EXTEND)
12132 std::swap(LHS, RHS);
12133
12134 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12135 SDValue ExtSrc = RHS.getOperand(0);
12136 EVT SrcVT = ExtSrc.getValueType();
12137 if (SrcVT == MVT::i32) {
12138 SDLoc SL(N);
12139 SDValue LowLHS, HiBits;
12140 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12141 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12142
12143 DCI.AddToWorklist(LowOr.getNode());
12144 DCI.AddToWorklist(HiBits.getNode());
12145
12146 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12147 LowOr, HiBits);
12148 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12149 }
12150 }
12151
12152 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12153 if (CRHS) {
12154 if (SDValue Split
12155 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12156 N->getOperand(0), CRHS))
12157 return Split;
12158 }
12159
12160 return SDValue();
12161}
12162
12163SDValue SITargetLowering::performXorCombine(SDNode *N,
12164 DAGCombinerInfo &DCI) const {
12165 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12166 return RV;
12167
12168 SDValue LHS = N->getOperand(0);
12169 SDValue RHS = N->getOperand(1);
12170
12171 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12172 SelectionDAG &DAG = DCI.DAG;
12173
12174 EVT VT = N->getValueType(0);
12175 if (CRHS && VT == MVT::i64) {
12176 if (SDValue Split
12177 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12178 return Split;
12179 }
12180
12181 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12182 // fneg-like xors into 64-bit select.
12183 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12184 // This looks like an fneg, try to fold as a source modifier.
12185 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12186 shouldFoldFNegIntoSrc(N, LHS)) {
12187 // xor (select c, a, b), 0x80000000 ->
12188 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12189 SDLoc DL(N);
12190 SDValue CastLHS =
12191 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12192 SDValue CastRHS =
12193 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12194 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12195 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12196 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12197 LHS->getOperand(0), FNegLHS, FNegRHS);
12198 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12199 }
12200 }
12201
12202 return SDValue();
12203}
12204
12205SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12206 DAGCombinerInfo &DCI) const {
12207 if (!Subtarget->has16BitInsts() ||
12208 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12209 return SDValue();
12210
12211 EVT VT = N->getValueType(0);
12212 if (VT != MVT::i32)
12213 return SDValue();
12214
12215 SDValue Src = N->getOperand(0);
12216 if (Src.getValueType() != MVT::i16)
12217 return SDValue();
12218
12219 return SDValue();
12220}
12221
12222SDValue
12223SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12224 DAGCombinerInfo &DCI) const {
12225 SDValue Src = N->getOperand(0);
12226 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12227
12228 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12229 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12230 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12231 VTSign->getVT() == MVT::i8) ||
12232 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12233 VTSign->getVT() == MVT::i16))) {
12234 assert(Subtarget->hasScalarSubwordLoads() &&
12235 "s_buffer_load_{u8, i8} are supported "
12236 "in GFX12 (or newer) architectures.");
12237 EVT VT = Src.getValueType();
12238 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12239                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
12240                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
12241    SDLoc DL(N);
12242 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12243 SDValue Ops[] = {
12244 Src.getOperand(0), // source register
12245 Src.getOperand(1), // offset
12246 Src.getOperand(2) // cachePolicy
12247 };
12248 auto *M = cast<MemSDNode>(Src);
12249 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12250 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12251 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12252 return LoadVal;
12253 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12254 VTSign->getVT() == MVT::i8) ||
12255 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12256 VTSign->getVT() == MVT::i16)) &&
12257 Src.hasOneUse()) {
12258 auto *M = cast<MemSDNode>(Src);
12259 SDValue Ops[] = {
12260 Src.getOperand(0), // Chain
12261 Src.getOperand(1), // rsrc
12262 Src.getOperand(2), // vindex
12263 Src.getOperand(3), // voffset
12264 Src.getOperand(4), // soffset
12265 Src.getOperand(5), // offset
12266 Src.getOperand(6),
12267 Src.getOperand(7)
12268 };
12269 // replace with BUFFER_LOAD_BYTE/SHORT
12270 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12271 Src.getOperand(0).getValueType());
12272 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12273        AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12274    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12275 ResList,
12276 Ops, M->getMemoryVT(),
12277 M->getMemOperand());
12278 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12279 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12280 }
12281 return SDValue();
12282}
12283
12284SDValue SITargetLowering::performClassCombine(SDNode *N,
12285 DAGCombinerInfo &DCI) const {
12286 SelectionDAG &DAG = DCI.DAG;
12287 SDValue Mask = N->getOperand(1);
12288
12289 // fp_class x, 0 -> false
12290 if (isNullConstant(Mask))
12291 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12292
12293 if (N->getOperand(0).isUndef())
12294 return DAG.getUNDEF(MVT::i1);
12295
12296 return SDValue();
12297}
12298
12299SDValue SITargetLowering::performRcpCombine(SDNode *N,
12300 DAGCombinerInfo &DCI) const {
12301 EVT VT = N->getValueType(0);
12302 SDValue N0 = N->getOperand(0);
12303
12304 if (N0.isUndef()) {
12305 return DCI.DAG.getConstantFP(
12306        APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12307        VT);
12308 }
12309
12310 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12311 N0.getOpcode() == ISD::SINT_TO_FP)) {
12312 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12313 N->getFlags());
12314 }
12315
12316 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12317 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12318 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12319 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12320 N0.getOperand(0), N->getFlags());
12321 }
12322
12322
12323  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12324}
12325
12326bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12327                                       unsigned MaxDepth) const {
12328 unsigned Opcode = Op.getOpcode();
12329 if (Opcode == ISD::FCANONICALIZE)
12330 return true;
12331
12332 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12333 const auto &F = CFP->getValueAPF();
12334 if (F.isNaN() && F.isSignaling())
12335 return false;
12336 if (!F.isDenormal())
12337 return true;
12338
12339 DenormalMode Mode =
12340 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12341 return Mode == DenormalMode::getIEEE();
12342 }
12343
12344 // If source is a result of another standard FP operation it is already in
12345 // canonical form.
12346 if (MaxDepth == 0)
12347 return false;
12348
12349 switch (Opcode) {
12350 // These will flush denorms if required.
12351 case ISD::FADD:
12352 case ISD::FSUB:
12353 case ISD::FMUL:
12354 case ISD::FCEIL:
12355 case ISD::FFLOOR:
12356 case ISD::FMA:
12357 case ISD::FMAD:
12358 case ISD::FSQRT:
12359 case ISD::FDIV:
12360 case ISD::FREM:
12361 case ISD::FP_ROUND:
12362 case ISD::FP_EXTEND:
12363 case ISD::FLDEXP:
12366 case AMDGPUISD::RCP:
12367 case AMDGPUISD::RSQ:
12371 case AMDGPUISD::LOG:
12372 case AMDGPUISD::EXP:
12376 case AMDGPUISD::FRACT:
12382 return true;
12383
12384 // It can/will be lowered or combined as a bit operation.
12385 // Need to check their input recursively to handle.
12386 case ISD::FNEG:
12387 case ISD::FABS:
12388 case ISD::FCOPYSIGN:
12389 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12390
12391 case ISD::FSIN:
12392 case ISD::FCOS:
12393 case ISD::FSINCOS:
12394 return Op.getValueType().getScalarType() != MVT::f16;
12395
12396 case ISD::FMINNUM:
12397 case ISD::FMAXNUM:
12398 case ISD::FMINNUM_IEEE:
12399 case ISD::FMAXNUM_IEEE:
12400 case ISD::FMINIMUM:
12401 case ISD::FMAXIMUM:
12402 case AMDGPUISD::CLAMP:
12403 case AMDGPUISD::FMED3:
12404 case AMDGPUISD::FMAX3:
12405 case AMDGPUISD::FMIN3:
12406  case AMDGPUISD::FMAXIMUM3:
12407  case AMDGPUISD::FMINIMUM3: {
12408    // FIXME: Shouldn't treat the generic operations differently based on these.
12409    // However, we aren't really required to flush the result from
12410    // minnum/maxnum.
12411
12412 // snans will be quieted, so we only need to worry about denormals.
12413 if (Subtarget->supportsMinMaxDenormModes() ||
12414 // FIXME: denormalsEnabledForType is broken for dynamic
12415 denormalsEnabledForType(DAG, Op.getValueType()))
12416 return true;
12417
12418 // Flushing may be required.
12419    // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12420    // targets we need to check their inputs recursively.
12421
12422 // FIXME: Does this apply with clamp? It's implemented with max.
12423 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12424 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12425 return false;
12426 }
12427
12428 return true;
12429 }
12430 case ISD::SELECT: {
12431 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12432 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12433 }
12434 case ISD::BUILD_VECTOR: {
12435 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12436 SDValue SrcOp = Op.getOperand(i);
12437 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12438 return false;
12439 }
12440
12441 return true;
12442 }
12443  case ISD::EXTRACT_VECTOR_ELT:
12444  case ISD::EXTRACT_SUBVECTOR: {
12445    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12446 }
12447  case ISD::INSERT_VECTOR_ELT: {
12448    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12449 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12450 }
12451 case ISD::UNDEF:
12452 // Could be anything.
12453 return false;
12454
12455 case ISD::BITCAST:
12456 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12457 case ISD::TRUNCATE: {
12458    // Hack around the mess we make when legalizing extract_vector_elt
12459 if (Op.getValueType() == MVT::i16) {
12460 SDValue TruncSrc = Op.getOperand(0);
12461 if (TruncSrc.getValueType() == MVT::i32 &&
12462 TruncSrc.getOpcode() == ISD::BITCAST &&
12463 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12464 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12465 }
12466 }
12467 return false;
12468 }
12469  case ISD::INTRINSIC_WO_CHAIN: {
12470    unsigned IntrinsicID = Op.getConstantOperandVal(0);
12471 // TODO: Handle more intrinsics
12472 switch (IntrinsicID) {
12473 case Intrinsic::amdgcn_cvt_pkrtz:
12474 case Intrinsic::amdgcn_cubeid:
12475 case Intrinsic::amdgcn_frexp_mant:
12476 case Intrinsic::amdgcn_fdot2:
12477 case Intrinsic::amdgcn_rcp:
12478 case Intrinsic::amdgcn_rsq:
12479 case Intrinsic::amdgcn_rsq_clamp:
12480 case Intrinsic::amdgcn_rcp_legacy:
12481 case Intrinsic::amdgcn_rsq_legacy:
12482 case Intrinsic::amdgcn_trig_preop:
12483 case Intrinsic::amdgcn_log:
12484 case Intrinsic::amdgcn_exp2:
12485 return true;
12486 default:
12487 break;
12488 }
12489
12490 [[fallthrough]];
12491 }
12492 default:
12493 // FIXME: denormalsEnabledForType is broken for dynamic
12494 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12495 DAG.isKnownNeverSNaN(Op);
12496 }
12497
12498 llvm_unreachable("invalid operation");
12499}
12500
12501bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12502                                       unsigned MaxDepth) const {
12503  const MachineRegisterInfo &MRI = MF.getRegInfo();
12504  MachineInstr *MI = MRI.getVRegDef(Reg);
12505 unsigned Opcode = MI->getOpcode();
12506
12507 if (Opcode == AMDGPU::G_FCANONICALIZE)
12508 return true;
12509
12510 std::optional<FPValueAndVReg> FCR;
12511 // Constant splat (can be padded with undef) or scalar constant.
12512 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12513 if (FCR->Value.isSignaling())
12514 return false;
12515 if (!FCR->Value.isDenormal())
12516 return true;
12517
12518 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12519 return Mode == DenormalMode::getIEEE();
12520 }
12521
12522 if (MaxDepth == 0)
12523 return false;
12524
12525 switch (Opcode) {
12526 case AMDGPU::G_FADD:
12527 case AMDGPU::G_FSUB:
12528 case AMDGPU::G_FMUL:
12529 case AMDGPU::G_FCEIL:
12530 case AMDGPU::G_FFLOOR:
12531 case AMDGPU::G_FRINT:
12532 case AMDGPU::G_FNEARBYINT:
12533 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12534 case AMDGPU::G_INTRINSIC_TRUNC:
12535 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12536 case AMDGPU::G_FMA:
12537 case AMDGPU::G_FMAD:
12538 case AMDGPU::G_FSQRT:
12539 case AMDGPU::G_FDIV:
12540 case AMDGPU::G_FREM:
12541 case AMDGPU::G_FPOW:
12542 case AMDGPU::G_FPEXT:
12543 case AMDGPU::G_FLOG:
12544 case AMDGPU::G_FLOG2:
12545 case AMDGPU::G_FLOG10:
12546 case AMDGPU::G_FPTRUNC:
12547 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12548 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12549 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12550 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12551 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12552 return true;
12553 case AMDGPU::G_FNEG:
12554 case AMDGPU::G_FABS:
12555 case AMDGPU::G_FCOPYSIGN:
12556 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12557 case AMDGPU::G_FMINNUM:
12558 case AMDGPU::G_FMAXNUM:
12559 case AMDGPU::G_FMINNUM_IEEE:
12560 case AMDGPU::G_FMAXNUM_IEEE:
12561 case AMDGPU::G_FMINIMUM:
12562 case AMDGPU::G_FMAXIMUM: {
12563 if (Subtarget->supportsMinMaxDenormModes() ||
12564 // FIXME: denormalsEnabledForType is broken for dynamic
12565 denormalsEnabledForType(MRI.getType(Reg), MF))
12566 return true;
12567
12568 [[fallthrough]];
12569 }
12570 case AMDGPU::G_BUILD_VECTOR:
12571 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12572 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12573 return false;
12574 return true;
12575 case AMDGPU::G_INTRINSIC:
12576 case AMDGPU::G_INTRINSIC_CONVERGENT:
12577 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12578 case Intrinsic::amdgcn_fmul_legacy:
12579 case Intrinsic::amdgcn_fmad_ftz:
12580 case Intrinsic::amdgcn_sqrt:
12581 case Intrinsic::amdgcn_fmed3:
12582 case Intrinsic::amdgcn_sin:
12583 case Intrinsic::amdgcn_cos:
12584 case Intrinsic::amdgcn_log:
12585 case Intrinsic::amdgcn_exp2:
12586 case Intrinsic::amdgcn_log_clamp:
12587 case Intrinsic::amdgcn_rcp:
12588 case Intrinsic::amdgcn_rcp_legacy:
12589 case Intrinsic::amdgcn_rsq:
12590 case Intrinsic::amdgcn_rsq_clamp:
12591 case Intrinsic::amdgcn_rsq_legacy:
12592 case Intrinsic::amdgcn_div_scale:
12593 case Intrinsic::amdgcn_div_fmas:
12594 case Intrinsic::amdgcn_div_fixup:
12595 case Intrinsic::amdgcn_fract:
12596 case Intrinsic::amdgcn_cvt_pkrtz:
12597 case Intrinsic::amdgcn_cubeid:
12598 case Intrinsic::amdgcn_cubema:
12599 case Intrinsic::amdgcn_cubesc:
12600 case Intrinsic::amdgcn_cubetc:
12601 case Intrinsic::amdgcn_frexp_mant:
12602 case Intrinsic::amdgcn_fdot2:
12603 case Intrinsic::amdgcn_trig_preop:
12604 return true;
12605 default:
12606 break;
12607 }
12608
12609 [[fallthrough]];
12610 default:
12611 return false;
12612 }
12613
12614 llvm_unreachable("invalid operation");
12615}
12616
12617// Constant fold canonicalize.
12618SDValue SITargetLowering::getCanonicalConstantFP(
12619 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12620 // Flush denormals to 0 if not enabled.
12621 if (C.isDenormal()) {
12622 DenormalMode Mode =
12623 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12624 if (Mode == DenormalMode::getPreserveSign()) {
12625 return DAG.getConstantFP(
12626 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12627 }
12628
12629 if (Mode != DenormalMode::getIEEE())
12630 return SDValue();
12631 }
12632
12633 if (C.isNaN()) {
12634 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12635 if (C.isSignaling()) {
12636 // Quiet a signaling NaN.
12637 // FIXME: Is this supposed to preserve payload bits?
12638 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12639 }
12640
12641 // Make sure it is the canonical NaN bitpattern.
12642 //
12643 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12644 // immediate?
12645 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12646 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12647 }
12648
12649 // Already canonical.
12650 return DAG.getConstantFP(C, SL, VT);
12651}
12652
12653static bool vectorEltWillFoldAway(SDValue Op) {
12654  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12655}
12656
12657SDValue SITargetLowering::performFCanonicalizeCombine(
12658 SDNode *N,
12659 DAGCombinerInfo &DCI) const {
12660 SelectionDAG &DAG = DCI.DAG;
12661 SDValue N0 = N->getOperand(0);
12662 EVT VT = N->getValueType(0);
12663
12664 // fcanonicalize undef -> qnan
12665 if (N0.isUndef()) {
12666    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12667    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12668 }
12669
12670 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12671 EVT VT = N->getValueType(0);
12672 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12673 }
12674
12675 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12676 // (fcanonicalize k)
12677 //
12678 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12679
12680 // TODO: This could be better with wider vectors that will be split to v2f16,
12681 // and to consider uses since there aren't that many packed operations.
12682 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12683 isTypeLegal(MVT::v2f16)) {
12684 SDLoc SL(N);
12685 SDValue NewElts[2];
12686 SDValue Lo = N0.getOperand(0);
12687 SDValue Hi = N0.getOperand(1);
12688 EVT EltVT = Lo.getValueType();
12689
12690    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
12691      for (unsigned I = 0; I != 2; ++I) {
12692 SDValue Op = N0.getOperand(I);
12693 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12694 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12695 CFP->getValueAPF());
12696 } else if (Op.isUndef()) {
12697 // Handled below based on what the other operand is.
12698 NewElts[I] = Op;
12699 } else {
12700 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12701 }
12702 }
12703
12704 // If one half is undef, and one is constant, prefer a splat vector rather
12705 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12706 // cheaper to use and may be free with a packed operation.
12707      if (NewElts[0].isUndef()) {
12708        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
12709                         ? NewElts[1]
12710                         : DAG.getConstantFP(0.0f, SL, EltVT);
12711      }
12712
12713 if (NewElts[1].isUndef()) {
12714 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
12715 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
12716 }
12717
12718 return DAG.getBuildVector(VT, SL, NewElts);
12719 }
12720 }
12721
12722 unsigned SrcOpc = N0.getOpcode();
12723
12724 // If it's free to do so, push canonicalizes further up the source, which may
12725 // find a canonical source.
12726 //
12727 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
12728 // sNaNs.
12729 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
12730 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
12731 if (CRHS && N0.hasOneUse()) {
12732 SDLoc SL(N);
12733 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
12734 N0.getOperand(0));
12735 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
12736 DCI.AddToWorklist(Canon0.getNode());
12737
12738 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
12739 }
12740 }
12741
12742 return isCanonicalized(DAG, N0) ? N0 : SDValue();
12743}
12744
12745static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
12746 switch (Opc) {
12747 case ISD::FMAXNUM:
12748 case ISD::FMAXNUM_IEEE:
12749 return AMDGPUISD::FMAX3;
12750 case ISD::FMAXIMUM:
12751 return AMDGPUISD::FMAXIMUM3;
12752 case ISD::SMAX:
12753 return AMDGPUISD::SMAX3;
12754 case ISD::UMAX:
12755 return AMDGPUISD::UMAX3;
12756 case ISD::FMINNUM:
12757 case ISD::FMINNUM_IEEE:
12758 return AMDGPUISD::FMIN3;
12759 case ISD::FMINIMUM:
12760 return AMDGPUISD::FMINIMUM3;
12761 case ISD::SMIN:
12762 return AMDGPUISD::SMIN3;
12763 case ISD::UMIN:
12764 return AMDGPUISD::UMIN3;
12765 default:
12766 llvm_unreachable("Not a min/max opcode");
12767 }
12768}
12769
12770SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
12771 const SDLoc &SL, SDValue Src,
12772 SDValue MinVal,
12773 SDValue MaxVal,
12774 bool Signed) const {
12775
12776 // med3 comes from
12777 // min(max(x, K0), K1), K0 < K1
12778 // max(min(x, K0), K1), K1 < K0
12779 //
12780 // "MinVal" and "MaxVal" respectively refer to the rhs of the
12781 // min/max op.
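  // For example, min(max(x, -5), 17) becomes smed3(x, -5, 17), clamping x to
  // [-5, 17]; the checks below reject the fold unless MaxVal is strictly less
  // than MinVal.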
12782 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
12783 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
12784
12785 if (!MinK || !MaxK)
12786 return SDValue();
12787
12788 if (Signed) {
12789 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
12790 return SDValue();
12791 } else {
12792 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
12793 return SDValue();
12794 }
12795
12796 EVT VT = MinK->getValueType(0);
12797 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
12798 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
12799 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
12800
12801 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
12802 // not available, but this is unlikely to be profitable as constants
12803 // will often need to be materialized & extended, especially on
12804 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
12805 return SDValue();
12806}
12807
12808static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
12809  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
12810 return C;
12811
12812 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
12813 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
12814 return C;
12815 }
12816
12817 return nullptr;
12818}
12819
12820SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
12821 const SDLoc &SL,
12822 SDValue Op0,
12823 SDValue Op1) const {
12824  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
12825  if (!K1)
12826 return SDValue();
12827
12828  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
12829  if (!K0)
12830 return SDValue();
12831
12832 // Ordered >= (although NaN inputs should have folded away by now).
12833 if (K0->getValueAPF() > K1->getValueAPF())
12834 return SDValue();
12835
12836 const MachineFunction &MF = DAG.getMachineFunction();
12837  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12838
12839 // TODO: Check IEEE bit enabled?
12840 EVT VT = Op0.getValueType();
12841 if (Info->getMode().DX10Clamp) {
12842 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
12843 // hardware fmed3 behavior converting to a min.
12844 // FIXME: Should this be allowing -0.0?
12845 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
12846 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
12847 }
12848
12849 // med3 for f16 is only available on gfx9+, and not available for v2f16.
12850 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
12851 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
12852 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
12853 // then give the other result, which is different from med3 with a NaN
12854 // input.
12855 SDValue Var = Op0.getOperand(0);
12856 if (!DAG.isKnownNeverSNaN(Var))
12857 return SDValue();
12858
12859    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12860
12861 if ((!K0->hasOneUse() ||
12862 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
12863 (!K1->hasOneUse() ||
12864 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
12865 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
12866 Var, SDValue(K0, 0), SDValue(K1, 0));
12867 }
12868 }
12869
12870 return SDValue();
12871}
12872
12873SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
12874 DAGCombinerInfo &DCI) const {
12875 SelectionDAG &DAG = DCI.DAG;
12876
12877 EVT VT = N->getValueType(0);
12878 unsigned Opc = N->getOpcode();
12879 SDValue Op0 = N->getOperand(0);
12880 SDValue Op1 = N->getOperand(1);
12881
12882  // Only do this if the inner op has one use since this will just increase
12883  // register pressure for no benefit.
12884
12885 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
12886 !VT.isVector() &&
12887 (VT == MVT::i32 || VT == MVT::f32 ||
12888 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
12889 // max(max(a, b), c) -> max3(a, b, c)
12890 // min(min(a, b), c) -> min3(a, b, c)
12891 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
12892 SDLoc DL(N);
12893 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
12894 DL,
12895 N->getValueType(0),
12896 Op0.getOperand(0),
12897 Op0.getOperand(1),
12898 Op1);
12899 }
12900
12901 // Try commuted.
12902 // max(a, max(b, c)) -> max3(a, b, c)
12903 // min(a, min(b, c)) -> min3(a, b, c)
12904 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
12905 SDLoc DL(N);
12906 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
12907 DL,
12908 N->getValueType(0),
12909 Op0,
12910 Op1.getOperand(0),
12911 Op1.getOperand(1));
12912 }
12913 }
12914
12915 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
12916 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
12917 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
12918 if (SDValue Med3 = performIntMed3ImmCombine(
12919 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
12920 return Med3;
12921 }
12922 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
12923 if (SDValue Med3 = performIntMed3ImmCombine(
12924 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
12925 return Med3;
12926 }
12927
12928 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
12929 if (SDValue Med3 = performIntMed3ImmCombine(
12930 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
12931 return Med3;
12932 }
12933 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
12934 if (SDValue Med3 = performIntMed3ImmCombine(
12935 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
12936 return Med3;
12937 }
12938
12939 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
12940 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
12941 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
12942 (Opc == AMDGPUISD::FMIN_LEGACY &&
12943 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
12944 (VT == MVT::f32 || VT == MVT::f64 ||
12945 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
12946 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
12947 Op0.hasOneUse()) {
12948 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
12949 return Res;
12950 }
12951
12952 return SDValue();
12953}
12954
12956 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
12957 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
12958 // FIXME: Should this be allowing -0.0?
12959 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
12960 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
12961 }
12962 }
12963
12964 return false;
12965}
12966
12967// FIXME: Should only worry about snans for version with chain.
12968SDValue SITargetLowering::performFMed3Combine(SDNode *N,
12969 DAGCombinerInfo &DCI) const {
12970 EVT VT = N->getValueType(0);
12971 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
12972 // NaNs. With a NaN input, the order of the operands may change the result.
12973
12974 SelectionDAG &DAG = DCI.DAG;
12975 SDLoc SL(N);
12976
12977 SDValue Src0 = N->getOperand(0);
12978 SDValue Src1 = N->getOperand(1);
12979 SDValue Src2 = N->getOperand(2);
12980
12981 if (isClampZeroToOne(Src0, Src1)) {
12982 // const_a, const_b, x -> clamp is safe in all cases including signaling
12983 // nans.
12984 // FIXME: Should this be allowing -0.0?
12985 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
12986 }
12987
12988 const MachineFunction &MF = DAG.getMachineFunction();
12989  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12990
12991 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
12992 // handling no dx10-clamp?
12993 if (Info->getMode().DX10Clamp) {
12994    // If NaNs are clamped to 0, we are free to reorder the inputs.
12995
12996 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
12997 std::swap(Src0, Src1);
12998
12999 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13000 std::swap(Src1, Src2);
13001
13002 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13003 std::swap(Src0, Src1);
13004
13005 if (isClampZeroToOne(Src1, Src2))
13006 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13007 }
13008
13009 return SDValue();
13010}
13011
13012SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13013 DAGCombinerInfo &DCI) const {
13014 SDValue Src0 = N->getOperand(0);
13015 SDValue Src1 = N->getOperand(1);
13016 if (Src0.isUndef() && Src1.isUndef())
13017 return DCI.DAG.getUNDEF(N->getValueType(0));
13018 return SDValue();
13019}
13020
13021// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13022// expanded into a set of cmp/select instructions.
13023bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13024                                                unsigned NumElem,
13025 bool IsDivergentIdx,
13026 const GCNSubtarget *Subtarget) {
13027  if (UseDivergentRegisterIndexing)
13028    return false;
13029
13030 unsigned VecSize = EltSize * NumElem;
13031
13032 // Sub-dword vectors of size 2 dword or less have better implementation.
13033 if (VecSize <= 64 && EltSize < 32)
13034 return false;
13035
13036  // Always expand the rest of the sub-dword instructions, otherwise they will be
13037 // lowered via memory.
13038 if (EltSize < 32)
13039 return true;
13040
13041 // Always do this if var-idx is divergent, otherwise it will become a loop.
13042 if (IsDivergentIdx)
13043 return true;
13044
13045 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13046 unsigned NumInsts = NumElem /* Number of compares */ +
13047 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
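  // For example, with a uniform index a v8i32 extract costs 8 compares plus 8
  // cndmasks (16 instructions), so it is expanded only when movrel is
  // unavailable.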
13048
13049 // On some architectures (GFX9) movrel is not available and it's better
13050 // to expand.
13051 if (!Subtarget->hasMovrel())
13052 return NumInsts <= 16;
13053
13054 // If movrel is available, use it instead of expanding for vector of 8
13055 // elements.
13056 return NumInsts <= 15;
13057}
13058
13059bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13060  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13061 if (isa<ConstantSDNode>(Idx))
13062 return false;
13063
13064 SDValue Vec = N->getOperand(0);
13065 EVT VecVT = Vec.getValueType();
13066 EVT EltVT = VecVT.getVectorElementType();
13067 unsigned EltSize = EltVT.getSizeInBits();
13068 unsigned NumElem = VecVT.getVectorNumElements();
13069
13070  return SITargetLowering::shouldExpandVectorDynExt(
13071      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13072}
13073
13074SDValue SITargetLowering::performExtractVectorEltCombine(
13075 SDNode *N, DAGCombinerInfo &DCI) const {
13076 SDValue Vec = N->getOperand(0);
13077 SelectionDAG &DAG = DCI.DAG;
13078
13079 EVT VecVT = Vec.getValueType();
13080 EVT VecEltVT = VecVT.getVectorElementType();
13081 EVT ResVT = N->getValueType(0);
13082
13083 unsigned VecSize = VecVT.getSizeInBits();
13084 unsigned VecEltSize = VecEltVT.getSizeInBits();
13085
13086 if ((Vec.getOpcode() == ISD::FNEG ||
13087       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13088    SDLoc SL(N);
13089 SDValue Idx = N->getOperand(1);
13090 SDValue Elt =
13091 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13092 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13093 }
13094
13095 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13096 // =>
13097 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13098 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13099 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13100 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13101 SDLoc SL(N);
13102 SDValue Idx = N->getOperand(1);
13103 unsigned Opc = Vec.getOpcode();
13104
13105 switch(Opc) {
13106 default:
13107 break;
13108 // TODO: Support other binary operations.
13109 case ISD::FADD:
13110 case ISD::FSUB:
13111 case ISD::FMUL:
13112 case ISD::ADD:
13113 case ISD::UMIN:
13114 case ISD::UMAX:
13115 case ISD::SMIN:
13116 case ISD::SMAX:
13117 case ISD::FMAXNUM:
13118 case ISD::FMINNUM:
13119 case ISD::FMAXNUM_IEEE:
13120 case ISD::FMINNUM_IEEE:
13121 case ISD::FMAXIMUM:
13122 case ISD::FMINIMUM: {
13123 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13124 Vec.getOperand(0), Idx);
13125 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13126 Vec.getOperand(1), Idx);
13127
13128 DCI.AddToWorklist(Elt0.getNode());
13129 DCI.AddToWorklist(Elt1.getNode());
13130 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13131 }
13132 }
13133 }
13134
13135 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13136  if (shouldExpandVectorDynExt(N)) {
13137    SDLoc SL(N);
13138 SDValue Idx = N->getOperand(1);
13139 SDValue V;
13140 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13141 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13142 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13143 if (I == 0)
13144 V = Elt;
13145 else
13146 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13147 }
13148 return V;
13149 }
13150
13151 if (!DCI.isBeforeLegalize())
13152 return SDValue();
13153
13154 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13155 // elements. This exposes more load reduction opportunities by replacing
13156 // multiple small extract_vector_elements with a single 32-bit extract.
13157 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13158 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13159 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13160 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13161
13162 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13163 unsigned EltIdx = BitIndex / 32;
13164 unsigned LeftoverBitIdx = BitIndex % 32;
13165 SDLoc SL(N);
13166
13167 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13168 DCI.AddToWorklist(Cast.getNode());
13169
13170 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13171 DAG.getConstant(EltIdx, SL, MVT::i32));
13172 DCI.AddToWorklist(Elt.getNode());
13173 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13174 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13175 DCI.AddToWorklist(Srl.getNode());
13176
13177 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13178 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13179 DCI.AddToWorklist(Trunc.getNode());
13180
13181 if (VecEltVT == ResVT) {
13182 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13183 }
13184
13185 assert(ResVT.isScalarInteger());
13186 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13187 }
13188
13189 return SDValue();
13190}
13191
13192SDValue
13193SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13194 DAGCombinerInfo &DCI) const {
13195 SDValue Vec = N->getOperand(0);
13196 SDValue Idx = N->getOperand(2);
13197 EVT VecVT = Vec.getValueType();
13198 EVT EltVT = VecVT.getVectorElementType();
13199
13200 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13201 // => BUILD_VECTOR n x select (e, const-idx)
13202  if (!shouldExpandVectorDynExt(N))
13203    return SDValue();
13204
13205 SelectionDAG &DAG = DCI.DAG;
13206 SDLoc SL(N);
13207 SDValue Ins = N->getOperand(1);
13208 EVT IdxVT = Idx.getValueType();
13209
13210  SmallVector<SDValue, 16> Ops;
13211  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13212 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13213 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13214 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13215 Ops.push_back(V);
13216 }
13217
13218 return DAG.getBuildVector(VecVT, SL, Ops);
13219}
13220
13221/// Return the source of an fp_extend from f16 to f32, or a converted FP
13222/// constant.
13224 if (Src.getOpcode() == ISD::FP_EXTEND &&
13225 Src.getOperand(0).getValueType() == MVT::f16) {
13226 return Src.getOperand(0);
13227 }
13228
13229 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13230 APFloat Val = CFP->getValueAPF();
13231 bool LosesInfo = true;
13232    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13233    if (!LosesInfo)
13234 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13235 }
13236
13237 return SDValue();
13238}
13239
13240SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13241 DAGCombinerInfo &DCI) const {
13242 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13243 "combine only useful on gfx8");
13244
13245 SDValue TruncSrc = N->getOperand(0);
13246 EVT VT = N->getValueType(0);
13247 if (VT != MVT::f16)
13248 return SDValue();
13249
13250 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13251 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13252 return SDValue();
13253
13254 SelectionDAG &DAG = DCI.DAG;
13255 SDLoc SL(N);
13256
13257 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13258 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13259 // casting back.
13260
13261 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13262 // fmin(fmax(a, b), fmax(fmin(a, b), c))
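  // For a <= b this reduces to fmin(b, fmax(a, c)), which selects the median
  // of {a, b, c} wherever c falls.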
13263 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13264 if (!A)
13265 return SDValue();
13266
13267 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13268 if (!B)
13269 return SDValue();
13270
13271 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13272 if (!C)
13273 return SDValue();
13274
13275 // This changes signaling nan behavior. If an input is a signaling nan, it
13276 // would have been quieted by the fpext originally. We don't care because
13277 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13278 // we would be worse off than just doing the promotion.
13279 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13280 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13281 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13282 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13283}
13284
13285unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13286 const SDNode *N0,
13287 const SDNode *N1) const {
13288 EVT VT = N0->getValueType(0);
13289
13290 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13291 // support denormals ever.
13292  if (((VT == MVT::f32 &&
13293        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13294       (VT == MVT::f16 && Subtarget->hasMadF16() &&
13295        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13296      isOperationLegal(ISD::FMAD, VT))
13297    return ISD::FMAD;
13298
13299 const TargetOptions &Options = DAG.getTarget().Options;
13300 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13301 (N0->getFlags().hasAllowContract() &&
13302 N1->getFlags().hasAllowContract())) &&
13303      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13304    return ISD::FMA;
13305 }
13306
13307 return 0;
13308}
13309
13310// For a reassociatable opcode perform:
13311// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13312SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13313 SelectionDAG &DAG) const {
13314 EVT VT = N->getValueType(0);
13315 if (VT != MVT::i32 && VT != MVT::i64)
13316 return SDValue();
13317
13318 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13319 return SDValue();
13320
13321 unsigned Opc = N->getOpcode();
13322 SDValue Op0 = N->getOperand(0);
13323 SDValue Op1 = N->getOperand(1);
13324
13325 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13326 return SDValue();
13327
13328 if (Op0->isDivergent())
13329 std::swap(Op0, Op1);
13330
13331 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13332 return SDValue();
13333
13334 SDValue Op2 = Op1.getOperand(1);
13335 Op1 = Op1.getOperand(0);
13336 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13337 return SDValue();
13338
13339 if (Op1->isDivergent())
13340 std::swap(Op1, Op2);
13341
13342 SDLoc SL(N);
13343 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13344 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13345}
13346
13347static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13348 EVT VT,
13349 SDValue N0, SDValue N1, SDValue N2,
13350 bool Signed) {
13351  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13352  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13353 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13354 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13355}
13356
13357// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13358// multiplies, if any.
13359//
13360// Full 64-bit multiplies that feed into an addition are lowered here instead
13361// of using the generic expansion. The generic expansion ends up with
13362// a tree of ADD nodes that prevents us from using the "add" part of the
13363// MAD instruction. The expansion produced here results in a chain of ADDs
13364// instead of a tree.
13365SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13366 DAGCombinerInfo &DCI) const {
13367 assert(N->getOpcode() == ISD::ADD);
13368
13369 SelectionDAG &DAG = DCI.DAG;
13370 EVT VT = N->getValueType(0);
13371 SDLoc SL(N);
13372 SDValue LHS = N->getOperand(0);
13373 SDValue RHS = N->getOperand(1);
13374
13375 if (VT.isVector())
13376 return SDValue();
13377
13378 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13379 // result in scalar registers for uniform values.
13380 if (!N->isDivergent() && Subtarget->hasSMulHi())
13381 return SDValue();
13382
13383 unsigned NumBits = VT.getScalarSizeInBits();
13384 if (NumBits <= 32 || NumBits > 64)
13385 return SDValue();
13386
13387 if (LHS.getOpcode() != ISD::MUL) {
13388 assert(RHS.getOpcode() == ISD::MUL);
13389 std::swap(LHS, RHS);
13390 }
13391
13392 // Avoid the fold if it would unduly increase the number of multiplies due to
13393 // multiple uses, except on hardware with full-rate multiply-add (which is
13394 // part of full-rate 64-bit ops).
13395 if (!Subtarget->hasFullRate64Ops()) {
13396 unsigned NumUsers = 0;
13397 for (SDNode *Use : LHS->uses()) {
13398 // There is a use that does not feed into addition, so the multiply can't
13399 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13400 if (Use->getOpcode() != ISD::ADD)
13401 return SDValue();
13402
13403 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13404 // MUL + 3xADD + 3xADDC over 3xMAD.
13405 ++NumUsers;
13406 if (NumUsers >= 3)
13407 return SDValue();
13408 }
13409 }
13410
13411 SDValue MulLHS = LHS.getOperand(0);
13412 SDValue MulRHS = LHS.getOperand(1);
13413 SDValue AddRHS = RHS;
13414
13415 // Always check whether operands are small unsigned values, since that
13416 // knowledge is useful in more cases. Check for small signed values only if
13417 // doing so can unlock a shorter code sequence.
13418 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13419 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13420
13421 bool MulSignedLo = false;
13422 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13423 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13424 numBitsSigned(MulRHS, DAG) <= 32;
13425 }
13426
13427 // The operands and final result all have the same number of bits. If
13428 // operands need to be extended, they can be extended with garbage. The
13429 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13430 // truncated away in the end.
13431 if (VT != MVT::i64) {
13432 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13433 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13434 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13435 }
13436
13437 // The basic code generated is conceptually straightforward. Pseudo code:
13438 //
13439 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13440 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13441 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13442 //
13443 // The second and third lines are optional, depending on whether the factors
13444 // are {sign,zero}-extended or not.
13445 //
13446 // The actual DAG is noisier than the pseudo code, but only due to
13447 // instructions that disassemble values into low and high parts, and
13448 // assemble the final result.
13449 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13450
13451 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13452 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13453 SDValue Accum =
13454 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13455
13456 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13457 SDValue AccumLo, AccumHi;
13458 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13459
13460 if (!MulLHSUnsigned32) {
13461 auto MulLHSHi =
13462 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13463 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13464 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13465 }
13466
13467 if (!MulRHSUnsigned32) {
13468 auto MulRHSHi =
13469 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13470 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13471 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13472 }
13473
13474 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13475 Accum = DAG.getBitcast(MVT::i64, Accum);
13476 }
13477
13478 if (VT != MVT::i64)
13479 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13480 return Accum;
13481}
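// Worked example (illustrative): for a divergent i64 "add (mul a, b), c" where
// both a and b are known to fit in 32 unsigned bits, only the first line of
// the pseudo code above survives and the whole expression collapses into a
// single mad_u64_u32 of a.lo, b.lo and c. Each factor that may exceed 32 bits
// instead contributes one extra 32-bit mul feeding an add into accum.hi, as in
// the second and third pseudo-code lines.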
13482
13483// Collect the ultimate src of each of the mul node's operands, and confirm
13484// each operand effectively provides only a single (8-bit) byte.
13485static std::optional<ByteProvider<SDValue>>
13486handleMulOperand(const SDValue &MulOperand) {
13487 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13488 if (!Byte0 || Byte0->isConstantZero()) {
13489 return std::nullopt;
13490 }
13491 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13492 if (Byte1 && !Byte1->isConstantZero()) {
13493 return std::nullopt;
13494 }
13495 return Byte0;
13496}
13497
13498static unsigned addPermMasks(unsigned First, unsigned Second) {
13499 unsigned FirstCs = First & 0x0c0c0c0c;
13500 unsigned SecondCs = Second & 0x0c0c0c0c;
13501 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13502 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13503
13504 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13505 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13506 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13507 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13508
13509 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13510}
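// Worked example (illustrative): a selector byte of 0x0c means "constant
// zero". Merging First = 0x0c0c0100 (real selectors only in the low two bytes)
// with Second = 0x03020c0c (real selectors only in the high two bytes) keeps
// the non-0x0c byte from each position, giving 0x03020100.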
13511
13512static void placeSources(ByteProvider<SDValue> &Src0,
13513 ByteProvider<SDValue> &Src1,
13514 SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
13515 SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
13516 int Step) {
13517
13518 assert(Src0.Src.has_value() && Src1.Src.has_value());
13519 // Src0s and Src1s are empty, just place arbitrarily.
13520 if (Step == 0) {
13521 Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
13522 Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
13523 return;
13524 }
13525
13526 for (int BPI = 0; BPI < 2; BPI++) {
13527 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13528 if (BPI == 1) {
13529 BPP = {Src1, Src0};
13530 }
13531 unsigned ZeroMask = 0x0c0c0c0c;
13532 unsigned FMask = 0xFF << (8 * (3 - Step));
13533
13534 unsigned FirstMask =
13535 BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13536 unsigned SecondMask =
13537 BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13538 // Attempt to find the Src vector which contains our SDValue; if found, add
13539 // our perm mask to the existing one. If we are unable to find a match for
13540 // the first SDValue, attempt to find a match for the second.
13541 int FirstGroup = -1;
13542 for (int I = 0; I < 2; I++) {
13543 SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
13544 I == 0 ? Src0s : Src1s;
13545 auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
13546 return IterElt.first == *BPP.first.Src;
13547 };
13548
13549 auto Match = llvm::find_if(Srcs, MatchesFirst);
13550 if (Match != Srcs.end()) {
13551 Match->second = addPermMasks(FirstMask, Match->second);
13552 FirstGroup = I;
13553 break;
13554 }
13555 }
13556 if (FirstGroup != -1) {
13557 SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
13558 FirstGroup == 1 ? Src0s : Src1s;
13559 auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
13560 return IterElt.first == *BPP.second.Src;
13561 };
13562 auto Match = llvm::find_if(Srcs, MatchesSecond);
13563 if (Match != Srcs.end()) {
13564 Match->second = addPermMasks(SecondMask, Match->second);
13565 } else
13566 Srcs.push_back({*BPP.second.Src, SecondMask});
13567 return;
13568 }
13569 }
13570
13571 // If we have made it here, then we could not find a match in Src0s or Src1s
13572 // for either Src0 or Src1, so just place them arbitrarily.
13573
13574 unsigned ZeroMask = 0x0c0c0c0c;
13575 unsigned FMask = 0xFF << (8 * (3 - Step));
13576
13577 Src0s.push_back(
13578 {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
13579 Src1s.push_back(
13580 {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
13581
13582 return;
13583}
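// Worked example (illustrative): at Step 0 a byte with SrcOffset 2 is recorded
// with the mask (2 << 24) + 0x0c0c0c = 0x020c0c0c, i.e. it occupies the most
// significant selector slot while the other three slots stay 0x0c (constant
// zero) until later steps fill them in.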
13584
13585static SDValue
13586resolveSources(SelectionDAG &DAG, SDLoc SL,
13587 SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
13588 bool IsSigned, bool IsAny) {
13589
13590 // If we have just one source, permute it accordingly.
13591 if (Srcs.size() == 1) {
13592 auto Elt = Srcs.begin();
13593 auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
13594
13595 // v_perm will produce the original value.
13596 if (Elt->second == 0x3020100)
13597 return EltVal;
13598
13599 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
13600 DAG.getConstant(Elt->second, SL, MVT::i32));
13601 }
13602
13603 auto FirstElt = Srcs.begin();
13604 auto SecondElt = std::next(FirstElt);
13605
13606 SmallVector<SDValue, 2> Perms;
13607
13608 // If we have multiple sources in the chain, combine them via perms (using
13609 // calculated perm mask) and Ors.
13610 while (true) {
13611 auto FirstMask = FirstElt->second;
13612 auto SecondMask = SecondElt->second;
13613
13614 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13615 unsigned FirstPlusFour = FirstMask | 0x04040404;
13616 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
13617 // original 0x0C.
13618 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13619
13620 auto PermMask = addPermMasks(FirstMask, SecondMask);
13621 auto FirstVal =
13622 DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
13623 auto SecondVal =
13624 DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
13625
13626 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13627 SecondVal,
13628 DAG.getConstant(PermMask, SL, MVT::i32)));
13629
13630 FirstElt = std::next(SecondElt);
13631 if (FirstElt == Srcs.end())
13632 break;
13633
13634 SecondElt = std::next(FirstElt);
13635 // If we only have a FirstElt, then just combine that into the cumulative
13636 // source node.
13637 if (SecondElt == Srcs.end()) {
13638 auto EltVal =
13639 DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
13640
13641 Perms.push_back(
13642 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
13643 DAG.getConstant(FirstElt->second, SL, MVT::i32)));
13644 break;
13645 }
13646 }
13647
13648 assert(Perms.size() == 1 || Perms.size() == 2);
13649 return Perms.size() == 2
13650 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13651 : Perms[0];
13652}
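// Worked example (illustrative): when two partial masks are merged above, the
// first one is first biased by 4 so its selectors refer to the other v_perm
// source operand. E.g. a first mask of 0x0c0c0100 becomes
// ((0x0c0c0100 | 0x04040404) & 0x0F0F0F0F) | 0x0c0c0000 = 0x0c0c0504, while
// the 0x0c slots remain constant zero.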
13653
13654static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
13655 unsigned ChainLength) {
13656 for (auto &[EntryVal, EntryMask] : Srcs) {
13657 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13658 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13659 EntryMask += ZeroMask;
13660 }
13661}
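// Worked example (illustrative): with ChainLength == 2 a mask that was built
// for a 4-long chain, e.g. 0x01000c0c, is shifted right by (4 - 2) * 8 = 16
// bits to 0x00000100 and then 0x0c0c0000 is added, giving 0x0c0c0100 so the
// two unused high selector bytes read as constant zero.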
13662
13663static bool isMul(const SDValue Op) {
13664 auto Opcode = Op.getOpcode();
13665
13666 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13667 Opcode == AMDGPUISD::MUL_I24);
13668}
13669
13670static std::optional<bool>
13671checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13672 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13673 const SDValue &S1Op, const SelectionDAG &DAG) {
13674 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13675 // of the dot4 are irrelevant.
13676 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13677 return false;
13678
13679 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13680 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13681 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13682 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13683 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13684 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13685
13686 assert(!(S0IsUnsigned && S0IsSigned));
13687 assert(!(S1IsUnsigned && S1IsSigned));
13688
13689 // There are 9 possible permutations of
13690 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13691
13692 // In two permutations, the sign bits are known to be the same for both Ops,
13693 // so simply return Signed / Unsigned corresponding to the MSB
13694
13695 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13696 return S0IsSigned;
13697
13698 // In another two permutations, the sign bits are known to be opposite. In
13699 // this case return std::nullopt to indicate a bad match.
13700
13701 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13702 return std::nullopt;
13703
13704 // In the remaining five permutations, we don't know the value of the sign
13705 // bit for at least one Op. Since we have a valid ByteProvider, we know that
13706 // the upper bits must be extension bits. Thus, the only way for the sign
13707 // bit to be unknown is if it was sign extended from an unknown value, or if
13708 // it was any-extended. In either case, it is correct to use the signed
13709 // version of the signedness semantics of dot4.
13710
13711 // In two such permutations, we know the sign bit is set for
13712 // one op, and the other is unknown. It is okay to use the signed version of
13713 // dot4.
13714 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
13715 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
13716 return true;
13717
13718 // In one such permutation, we don't know either of the sign bits. It is okay
13719 // to use the signed version of dot4.
13720 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
13721 return true;
13722
13723 // In two such permutations, we know the sign bit is unset for
13724 // one op, and the other is unknown. Return std::nullopt to indicate a
13725 // bad match.
13726 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
13727 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
13728 return std::nullopt;
13729
13730 llvm_unreachable("Fully covered condition");
13731}
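// Summary of the cases above (illustrative): if the sign bit of both operands
// is known and agrees, that common signedness is returned; if the known sign
// bits disagree, or one operand is known non-negative while the other is
// unknown, std::nullopt signals a bad match; in every remaining case the
// unknown upper bits are extension bits, so the signed form of dot4 is safe
// and true is returned.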
13732
13733SDValue SITargetLowering::performAddCombine(SDNode *N,
13734 DAGCombinerInfo &DCI) const {
13735 SelectionDAG &DAG = DCI.DAG;
13736 EVT VT = N->getValueType(0);
13737 SDLoc SL(N);
13738 SDValue LHS = N->getOperand(0);
13739 SDValue RHS = N->getOperand(1);
13740
13741 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
13742 if (Subtarget->hasMad64_32()) {
13743 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
13744 return Folded;
13745 }
13746 }
13747
13748 if (SDValue V = reassociateScalarOps(N, DAG)) {
13749 return V;
13750 }
13751
13752 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
13753 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
13754 SDValue TempNode(N, 0);
13755 std::optional<bool> IsSigned;
13756 SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
13757 SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
13758 SmallVector<SDValue, 4> Src2s;
13759
13760 // Match the v_dot4 tree, while collecting src nodes.
13761 int ChainLength = 0;
13762 for (int I = 0; I < 4; I++) {
13763 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
13764 if (MulIdx == -1)
13765 break;
13766 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
13767 if (!Src0)
13768 break;
13769 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
13770 if (!Src1)
13771 break;
13772
13773 auto IterIsSigned = checkDot4MulSignedness(
13774 TempNode->getOperand(MulIdx), *Src0, *Src1,
13775 TempNode->getOperand(MulIdx)->getOperand(0),
13776 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
13777 if (!IterIsSigned)
13778 break;
13779 if (!IsSigned)
13780 IsSigned = *IterIsSigned;
13781 if (*IterIsSigned != *IsSigned)
13782 break;
13783 placeSources(*Src0, *Src1, Src0s, Src1s, I);
13784 auto AddIdx = 1 - MulIdx;
13785 // Allow the special case where add (add (mul24, 0), mul24) became ->
13786 // add (mul24, mul24).
13787 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
13788 Src2s.push_back(TempNode->getOperand(AddIdx));
13789 auto Src0 =
13790 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
13791 if (!Src0)
13792 break;
13793 auto Src1 =
13794 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
13795 if (!Src1)
13796 break;
13797 auto IterIsSigned = checkDot4MulSignedness(
13798 TempNode->getOperand(AddIdx), *Src0, *Src1,
13799 TempNode->getOperand(AddIdx)->getOperand(0),
13800 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
13801 if (!IterIsSigned)
13802 break;
13803 assert(IsSigned);
13804 if (*IterIsSigned != *IsSigned)
13805 break;
13806 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
13807 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
13808 ChainLength = I + 2;
13809 break;
13810 }
13811
13812 TempNode = TempNode->getOperand(AddIdx);
13813 Src2s.push_back(TempNode);
13814 ChainLength = I + 1;
13815 if (TempNode->getNumOperands() < 2)
13816 break;
13817 LHS = TempNode->getOperand(0);
13818 RHS = TempNode->getOperand(1);
13819 }
13820
13821 if (ChainLength < 2)
13822 return SDValue();
13823
13824 // Masks were constructed with the assumption that we would find a chain of
13825 // length 4. If not, then we need to zero out the unused most significant
13826 // bytes (via a perm mask of 0x0c) so they do not affect the dot calculation.
13827 if (ChainLength < 4) {
13828 fixMasks(Src0s, ChainLength);
13829 fixMasks(Src1s, ChainLength);
13830 }
13831
13832 SDValue Src0, Src1;
13833
13834 // If we are just using a single source for both, and have permuted the
13835 // bytes consistently, we can just use the sources without permuting
13836 // (commutation).
13837 bool UseOriginalSrc = false;
13838 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
13839 Src0s.begin()->second == Src1s.begin()->second &&
13840 Src0s.begin()->first.getValueSizeInBits() == 32 &&
13841 Src1s.begin()->first.getValueSizeInBits() == 32) {
13842 SmallVector<unsigned, 4> SrcBytes;
13843 auto Src0Mask = Src0s.begin()->second;
13844 SrcBytes.push_back(Src0Mask & 0xFF000000);
13845 bool UniqueEntries = true;
13846 for (auto I = 1; I < 4; I++) {
13847 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
13848
13849 if (is_contained(SrcBytes, NextByte)) {
13850 UniqueEntries = false;
13851 break;
13852 }
13853 SrcBytes.push_back(NextByte);
13854 }
13855
13856 if (UniqueEntries) {
13857 UseOriginalSrc = true;
13858 // Must be 32 bits to enter above conditional.
13859 assert(Src0s.begin()->first.getValueSizeInBits() == 32);
13860 assert(Src1s.begin()->first.getValueSizeInBits() == 32);
13861 Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
13862 Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
13863 }
13864 }
13865
13866 if (!UseOriginalSrc) {
13867 Src0 = resolveSources(DAG, SL, Src0s, false, true);
13868 Src1 = resolveSources(DAG, SL, Src1s, false, true);
13869 }
13870
13871 assert(IsSigned);
13872 SDValue Src2 =
13873 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
13874
13875 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
13876 : Intrinsic::amdgcn_udot4,
13877 SL, MVT::i64);
13878
13879 assert(!VT.isVector());
13880 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
13881 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
13882
13883 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
13884 }
13885
13886 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
13887 return SDValue();
13888
13889 // add x, zext (setcc) => uaddo_carry x, 0, setcc
13890 // add x, sext (setcc) => usubo_carry x, 0, setcc
13891 unsigned Opc = LHS.getOpcode();
13892 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
13893 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
13894 std::swap(RHS, LHS);
13895
13896 Opc = RHS.getOpcode();
13897 switch (Opc) {
13898 default: break;
13899 case ISD::ZERO_EXTEND:
13900 case ISD::SIGN_EXTEND:
13901 case ISD::ANY_EXTEND: {
13902 auto Cond = RHS.getOperand(0);
13903 // If this won't be a real VOPC output, we would still need to insert an
13904 // extra instruction anyway.
13905 if (!isBoolSGPR(Cond))
13906 break;
13907 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
13908 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
13909 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
13910 return DAG.getNode(Opc, SL, VTList, Args);
13911 }
13912 case ISD::UADDO_CARRY: {
13913 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
13914 if (!isNullConstant(RHS.getOperand(1)))
13915 break;
13916 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
13917 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
13918 }
13919 }
13920 return SDValue();
13921}
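// Worked example (illustrative): given "add x, (zext (setcc ...))" where the
// setcc result is an SGPR boolean, the combine above rewrites it to
// "uaddo_carry x, 0, cc", so the +0/+1 ends up in the carry input of the
// eventual add-with-carry instruction instead of materializing the
// zero-extended boolean first.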
13922
13923SDValue SITargetLowering::performSubCombine(SDNode *N,
13924 DAGCombinerInfo &DCI) const {
13925 SelectionDAG &DAG = DCI.DAG;
13926 EVT VT = N->getValueType(0);
13927
13928 if (VT != MVT::i32)
13929 return SDValue();
13930
13931 SDLoc SL(N);
13932 SDValue LHS = N->getOperand(0);
13933 SDValue RHS = N->getOperand(1);
13934
13935 // sub x, zext (setcc) => usubo_carry x, 0, setcc
13936 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
13937 unsigned Opc = RHS.getOpcode();
13938 switch (Opc) {
13939 default: break;
13940 case ISD::ZERO_EXTEND:
13941 case ISD::SIGN_EXTEND:
13942 case ISD::ANY_EXTEND: {
13943 auto Cond = RHS.getOperand(0);
13944 // If this won't be a real VOPC output, we would still need to insert an
13945 // extra instruction anyway.
13946 if (!isBoolSGPR(Cond))
13947 break;
13948 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
13949 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
13950 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
13951 return DAG.getNode(Opc, SL, VTList, Args);
13952 }
13953 }
13954
13955 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
13956 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
13957 if (!isNullConstant(LHS.getOperand(1)))
13958 return SDValue();
13959 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
13960 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
13961 }
13962 return SDValue();
13963}
13964
13965SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
13966 DAGCombinerInfo &DCI) const {
13967
13968 if (N->getValueType(0) != MVT::i32)
13969 return SDValue();
13970
13971 if (!isNullConstant(N->getOperand(1)))
13972 return SDValue();
13973
13974 SelectionDAG &DAG = DCI.DAG;
13975 SDValue LHS = N->getOperand(0);
13976
13977 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
13978 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
13979 unsigned LHSOpc = LHS.getOpcode();
13980 unsigned Opc = N->getOpcode();
13981 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
13982 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
13983 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
13984 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
13985 }
13986 return SDValue();
13987}
13988
13989SDValue SITargetLowering::performFAddCombine(SDNode *N,
13990 DAGCombinerInfo &DCI) const {
13991 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13992 return SDValue();
13993
13994 SelectionDAG &DAG = DCI.DAG;
13995 EVT VT = N->getValueType(0);
13996
13997 SDLoc SL(N);
13998 SDValue LHS = N->getOperand(0);
13999 SDValue RHS = N->getOperand(1);
14000
14001 // These should really be instruction patterns, but writing patterns with
14002 // source modifiers is a pain.
14003
14004 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14005 if (LHS.getOpcode() == ISD::FADD) {
14006 SDValue A = LHS.getOperand(0);
14007 if (A == LHS.getOperand(1)) {
14008 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14009 if (FusedOp != 0) {
14010 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14011 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14012 }
14013 }
14014 }
14015
14016 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14017 if (RHS.getOpcode() == ISD::FADD) {
14018 SDValue A = RHS.getOperand(0);
14019 if (A == RHS.getOperand(1)) {
14020 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14021 if (FusedOp != 0) {
14022 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14023 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14024 }
14025 }
14026 }
14027
14028 return SDValue();
14029}
14030
14031SDValue SITargetLowering::performFSubCombine(SDNode *N,
14032 DAGCombinerInfo &DCI) const {
14033 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14034 return SDValue();
14035
14036 SelectionDAG &DAG = DCI.DAG;
14037 SDLoc SL(N);
14038 EVT VT = N->getValueType(0);
14039 assert(!VT.isVector());
14040
14041 // Try to get the fneg to fold into the source modifier. This undoes generic
14042 // DAG combines and folds them into the mad.
14043 //
14044 // Only do this if we are not trying to support denormals. v_mad_f32 does
14045 // not support denormals ever.
14046 SDValue LHS = N->getOperand(0);
14047 SDValue RHS = N->getOperand(1);
14048 if (LHS.getOpcode() == ISD::FADD) {
14049 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14050 SDValue A = LHS.getOperand(0);
14051 if (A == LHS.getOperand(1)) {
14052 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14053 if (FusedOp != 0){
14054 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14055 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14056
14057 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14058 }
14059 }
14060 }
14061
14062 if (RHS.getOpcode() == ISD::FADD) {
14063 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14064
14065 SDValue A = RHS.getOperand(0);
14066 if (A == RHS.getOperand(1)) {
14067 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14068 if (FusedOp != 0){
14069 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14070 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14071 }
14072 }
14073 }
14074
14075 return SDValue();
14076}
14077
14078SDValue SITargetLowering::performFDivCombine(SDNode *N,
14079 DAGCombinerInfo &DCI) const {
14080 SelectionDAG &DAG = DCI.DAG;
14081 SDLoc SL(N);
14082 EVT VT = N->getValueType(0);
14083 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14084 return SDValue();
14085
14086 SDValue LHS = N->getOperand(0);
14087 SDValue RHS = N->getOperand(1);
14088
14089 SDNodeFlags Flags = N->getFlags();
14090 SDNodeFlags RHSFlags = RHS->getFlags();
14091 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14092 !RHS->hasOneUse())
14093 return SDValue();
14094
14095 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14096 bool IsNegative = false;
14097 if (CLHS->isExactlyValue(1.0) ||
14098 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14099 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14100 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14101 if (RHS.getOpcode() == ISD::FSQRT) {
14102 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14103 SDValue Rsq =
14104 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14105 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14106 }
14107 }
14108 }
14109
14110 return SDValue();
14111}
14112
14113SDValue SITargetLowering::performFMACombine(SDNode *N,
14114 DAGCombinerInfo &DCI) const {
14115 SelectionDAG &DAG = DCI.DAG;
14116 EVT VT = N->getValueType(0);
14117 SDLoc SL(N);
14118
14119 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14120 return SDValue();
14121
14122 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14123 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14124 SDValue Op1 = N->getOperand(0);
14125 SDValue Op2 = N->getOperand(1);
14126 SDValue FMA = N->getOperand(2);
14127
14128 if (FMA.getOpcode() != ISD::FMA ||
14129 Op1.getOpcode() != ISD::FP_EXTEND ||
14130 Op2.getOpcode() != ISD::FP_EXTEND)
14131 return SDValue();
14132
14133 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14134 // regardless of the denorm mode setting. Therefore,
14135 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14136 const TargetOptions &Options = DAG.getTarget().Options;
14137 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14138 (N->getFlags().hasAllowContract() &&
14139 FMA->getFlags().hasAllowContract())) {
14140 Op1 = Op1.getOperand(0);
14141 Op2 = Op2.getOperand(0);
14142 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14143 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14144 return SDValue();
14145
14146 SDValue Vec1 = Op1.getOperand(0);
14147 SDValue Idx1 = Op1.getOperand(1);
14148 SDValue Vec2 = Op2.getOperand(0);
14149
14150 SDValue FMAOp1 = FMA.getOperand(0);
14151 SDValue FMAOp2 = FMA.getOperand(1);
14152 SDValue FMAAcc = FMA.getOperand(2);
14153
14154 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14155 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14156 return SDValue();
14157
14158 FMAOp1 = FMAOp1.getOperand(0);
14159 FMAOp2 = FMAOp2.getOperand(0);
14160 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14161 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14162 return SDValue();
14163
14164 SDValue Vec3 = FMAOp1.getOperand(0);
14165 SDValue Vec4 = FMAOp2.getOperand(0);
14166 SDValue Idx2 = FMAOp1.getOperand(1);
14167
14168 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14169 // Idx1 and Idx2 cannot be the same.
14170 Idx1 == Idx2)
14171 return SDValue();
14172
14173 if (Vec1 == Vec2 || Vec3 == Vec4)
14174 return SDValue();
14175
14176 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14177 return SDValue();
14178
14179 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14180 (Vec1 == Vec4 && Vec2 == Vec3)) {
14181 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14182 DAG.getTargetConstant(0, SL, MVT::i1));
14183 }
14184 }
14185 return SDValue();
14186}
14187
14188SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14189 DAGCombinerInfo &DCI) const {
14190 SelectionDAG &DAG = DCI.DAG;
14191 SDLoc SL(N);
14192
14193 SDValue LHS = N->getOperand(0);
14194 SDValue RHS = N->getOperand(1);
14195 EVT VT = LHS.getValueType();
14196 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14197
14198 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14199 if (!CRHS) {
14200 CRHS = dyn_cast<ConstantSDNode>(LHS);
14201 if (CRHS) {
14202 std::swap(LHS, RHS);
14203 CC = ISD::getSetCCSwappedOperands(CC);
14204 }
14205 }
14206
14207 if (CRHS) {
14208 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14209 isBoolSGPR(LHS.getOperand(0))) {
14210 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14211 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14212 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14213 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14214 if ((CRHS->isAllOnes() &&
14215 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14216 (CRHS->isZero() &&
14217 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14218 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14219 DAG.getConstant(-1, SL, MVT::i1));
14220 if ((CRHS->isAllOnes() &&
14221 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14222 (CRHS->isZero() &&
14223 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14224 return LHS.getOperand(0);
14225 }
14226
14227 const APInt &CRHSVal = CRHS->getAPIntValue();
14228 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14229 LHS.getOpcode() == ISD::SELECT &&
14230 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14231 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14232 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14233 isBoolSGPR(LHS.getOperand(0))) {
14234 // Given CT != FT:
14235 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14236 // setcc (select cc, CT, CF), CF, ne => cc
14237 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14238 // setcc (select cc, CT, CF), CT, eq => cc
14239 const APInt &CT = LHS.getConstantOperandAPInt(1);
14240 const APInt &CF = LHS.getConstantOperandAPInt(2);
14241
14242 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14243 (CT == CRHSVal && CC == ISD::SETNE))
14244 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14245 DAG.getConstant(-1, SL, MVT::i1));
14246 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14247 (CT == CRHSVal && CC == ISD::SETEQ))
14248 return LHS.getOperand(0);
14249 }
14250 }
14251
14252 if (VT != MVT::f32 && VT != MVT::f64 &&
14253 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14254 return SDValue();
14255
14256 // Match isinf/isfinite pattern
14257 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14258 // (fcmp one (fabs x), inf) -> (fp_class x,
14259 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14260 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14261 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14262 if (!CRHS)
14263 return SDValue();
14264
14265 const APFloat &APF = CRHS->getValueAPF();
14266 if (APF.isInfinity() && !APF.isNegative()) {
14267 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14268 SIInstrFlags::N_INFINITY;
14269 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14270 SIInstrFlags::P_ZERO |
14271 SIInstrFlags::N_NORMAL |
14272 SIInstrFlags::P_NORMAL |
14273 SIInstrFlags::N_SUBNORMAL |
14274 SIInstrFlags::P_SUBNORMAL;
14275 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14276 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14277 DAG.getConstant(Mask, SL, MVT::i32));
14278 }
14279 }
14280
14281 return SDValue();
14282}
14283
14284SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14285 DAGCombinerInfo &DCI) const {
14286 SelectionDAG &DAG = DCI.DAG;
14287 SDLoc SL(N);
14288 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14289
14290 SDValue Src = N->getOperand(0);
14291 SDValue Shift = N->getOperand(0);
14292
14293 // TODO: Extend type shouldn't matter (assuming legal types).
14294 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14295 Shift = Shift.getOperand(0);
14296
14297 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14298 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14299 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14300 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14301 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14302 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14303 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14304 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14305 SDLoc(Shift.getOperand(0)), MVT::i32);
14306
14307 unsigned ShiftOffset = 8 * Offset;
14308 if (Shift.getOpcode() == ISD::SHL)
14309 ShiftOffset -= C->getZExtValue();
14310 else
14311 ShiftOffset += C->getZExtValue();
14312
14313 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14314 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14315 MVT::f32, Shifted);
14316 }
14317 }
14318 }
14319
14320 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14321 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14322 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14323 // We simplified Src. If this node is not dead, visit it again so it is
14324 // folded properly.
14325 if (N->getOpcode() != ISD::DELETED_NODE)
14326 DCI.AddToWorklist(N);
14327 return SDValue(N, 0);
14328 }
14329
14330 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14331 if (SDValue DemandedSrc =
14332 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14333 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14334
14335 return SDValue();
14336}
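// Worked example (illustrative): for "cvt_f32_ubyte0 (srl x, 16)" the Offset
// is 0 and the right shift contributes 16 bits, so ShiftOffset becomes 16 and
// the node above is rebuilt as "cvt_f32_ubyte2 x" on the unshifted value.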
14337
14338SDValue SITargetLowering::performClampCombine(SDNode *N,
14339 DAGCombinerInfo &DCI) const {
14340 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14341 if (!CSrc)
14342 return SDValue();
14343
14344 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14345 const APFloat &F = CSrc->getValueAPF();
14346 APFloat Zero = APFloat::getZero(F.getSemantics());
14347 if (F < Zero ||
14348 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14349 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14350 }
14351
14352 APFloat One(F.getSemantics(), "1.0");
14353 if (F > One)
14354 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14355
14356 return SDValue(CSrc, 0);
14357}
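// Worked example (illustrative): clamp of the constant 2.5 folds to 1.0, clamp
// of -0.5 folds to +0.0, and clamp of a NaN folds to +0.0 only when DX10Clamp
// is set; otherwise the NaN constant is returned unchanged.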
14358
14359
14360SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14361 DAGCombinerInfo &DCI) const {
14362 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14363 return SDValue();
14364 switch (N->getOpcode()) {
14365 case ISD::ADD:
14366 return performAddCombine(N, DCI);
14367 case ISD::SUB:
14368 return performSubCombine(N, DCI);
14369 case ISD::UADDO_CARRY:
14370 case ISD::USUBO_CARRY:
14371 return performAddCarrySubCarryCombine(N, DCI);
14372 case ISD::FADD:
14373 return performFAddCombine(N, DCI);
14374 case ISD::FSUB:
14375 return performFSubCombine(N, DCI);
14376 case ISD::FDIV:
14377 return performFDivCombine(N, DCI);
14378 case ISD::SETCC:
14379 return performSetCCCombine(N, DCI);
14380 case ISD::FMAXNUM:
14381 case ISD::FMINNUM:
14382 case ISD::FMAXNUM_IEEE:
14383 case ISD::FMINNUM_IEEE:
14384 case ISD::FMAXIMUM:
14385 case ISD::FMINIMUM:
14386 case ISD::SMAX:
14387 case ISD::SMIN:
14388 case ISD::UMAX:
14389 case ISD::UMIN:
14390 case AMDGPUISD::FMIN_LEGACY:
14391 case AMDGPUISD::FMAX_LEGACY:
14392 return performMinMaxCombine(N, DCI);
14393 case ISD::FMA:
14394 return performFMACombine(N, DCI);
14395 case ISD::AND:
14396 return performAndCombine(N, DCI);
14397 case ISD::OR:
14398 return performOrCombine(N, DCI);
14399 case ISD::FSHR: {
14400 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14401 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14402 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14403 return matchPERM(N, DCI);
14404 }
14405 break;
14406 }
14407 case ISD::XOR:
14408 return performXorCombine(N, DCI);
14409 case ISD::ZERO_EXTEND:
14410 return performZeroExtendCombine(N, DCI);
14411 case ISD::SIGN_EXTEND_INREG:
14412 return performSignExtendInRegCombine(N , DCI);
14413 case AMDGPUISD::FP_CLASS:
14414 return performClassCombine(N, DCI);
14415 case ISD::FCANONICALIZE:
14416 return performFCanonicalizeCombine(N, DCI);
14417 case AMDGPUISD::RCP:
14418 return performRcpCombine(N, DCI);
14419 case ISD::FLDEXP:
14420 case AMDGPUISD::FRACT:
14421 case AMDGPUISD::RSQ:
14422 case AMDGPUISD::RCP_LEGACY:
14423 case AMDGPUISD::RCP_IFLAG:
14424 case AMDGPUISD::RSQ_CLAMP: {
14425 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14426 SDValue Src = N->getOperand(0);
14427 if (Src.isUndef())
14428 return Src;
14429 break;
14430 }
14431 case ISD::SINT_TO_FP:
14432 case ISD::UINT_TO_FP:
14433 return performUCharToFloatCombine(N, DCI);
14434 case ISD::FCOPYSIGN:
14435 return performFCopySignCombine(N, DCI);
14436 case AMDGPUISD::CVT_F32_UBYTE0:
14437 case AMDGPUISD::CVT_F32_UBYTE1:
14438 case AMDGPUISD::CVT_F32_UBYTE2:
14439 case AMDGPUISD::CVT_F32_UBYTE3:
14440 return performCvtF32UByteNCombine(N, DCI);
14441 case AMDGPUISD::FMED3:
14442 return performFMed3Combine(N, DCI);
14443 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14444 return performCvtPkRTZCombine(N, DCI);
14445 case AMDGPUISD::CLAMP:
14446 return performClampCombine(N, DCI);
14447 case ISD::SCALAR_TO_VECTOR: {
14448 SelectionDAG &DAG = DCI.DAG;
14449 EVT VT = N->getValueType(0);
14450
14451 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14452 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
14453 SDLoc SL(N);
14454 SDValue Src = N->getOperand(0);
14455 EVT EltVT = Src.getValueType();
14456 if (EltVT != MVT::i16)
14457 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14458
14459 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14460 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14461 }
14462
14463 break;
14464 }
14465 case ISD::EXTRACT_VECTOR_ELT:
14466 return performExtractVectorEltCombine(N, DCI);
14467 case ISD::INSERT_VECTOR_ELT:
14468 return performInsertVectorEltCombine(N, DCI);
14469 case ISD::FP_ROUND:
14470 return performFPRoundCombine(N, DCI);
14471 case ISD::LOAD: {
14472 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14473 return Widened;
14474 [[fallthrough]];
14475 }
14476 default: {
14477 if (!DCI.isBeforeLegalize()) {
14478 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14479 return performMemSDNodeCombine(MemNode, DCI);
14480 }
14481
14482 break;
14483 }
14484 }
14485
14486 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14487}
14488
14489/// Helper function for adjustWritemask
14490static unsigned SubIdx2Lane(unsigned Idx) {
14491 switch (Idx) {
14492 default: return ~0u;
14493 case AMDGPU::sub0: return 0;
14494 case AMDGPU::sub1: return 1;
14495 case AMDGPU::sub2: return 2;
14496 case AMDGPU::sub3: return 3;
14497 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14498 }
14499}
14500
14501/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14502SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14503 SelectionDAG &DAG) const {
14504 unsigned Opcode = Node->getMachineOpcode();
14505
14506 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14507 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14508 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14509 return Node; // not implemented for D16
14510
14511 SDNode *Users[5] = { nullptr };
14512 unsigned Lane = 0;
14513 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14514 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14515 unsigned NewDmask = 0;
14516 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14517 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14518 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14519 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14520 ? true
14521 : false;
14522 unsigned TFCLane = 0;
14523 bool HasChain = Node->getNumValues() > 1;
14524
14525 if (OldDmask == 0) {
14526 // These are folded out, but on the chance it happens don't assert.
14527 return Node;
14528 }
14529
14530 unsigned OldBitsSet = llvm::popcount(OldDmask);
14531 // Work out which is the TFE/LWE lane if that is enabled.
14532 if (UsesTFC) {
14533 TFCLane = OldBitsSet;
14534 }
14535
14536 // Try to figure out the used register components
14537 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14538 I != E; ++I) {
14539
14540 // Don't look at users of the chain.
14541 if (I.getUse().getResNo() != 0)
14542 continue;
14543
14544 // Abort if we can't understand the usage
14545 if (!I->isMachineOpcode() ||
14546 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14547 return Node;
14548
14549 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14550 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14551 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14552 // set, etc.
14553 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14554 if (Lane == ~0u)
14555 return Node;
14556
14557 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14558 if (UsesTFC && Lane == TFCLane) {
14559 Users[Lane] = *I;
14560 } else {
14561 // Set which texture component corresponds to the lane.
14562 unsigned Comp;
14563 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14564 Comp = llvm::countr_zero(Dmask);
14565 Dmask &= ~(1 << Comp);
14566 }
14567
14568 // Abort if we have more than one user per component.
14569 if (Users[Lane])
14570 return Node;
14571
14572 Users[Lane] = *I;
14573 NewDmask |= 1 << Comp;
14574 }
14575 }
14576
14577 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14578 bool NoChannels = !NewDmask;
14579 if (NoChannels) {
14580 if (!UsesTFC) {
14581 // No uses of the result and not using TFC. Then do nothing.
14582 return Node;
14583 }
14584 // If the original dmask has one channel - then nothing to do
14585 if (OldBitsSet == 1)
14586 return Node;
14587 // Use an arbitrary dmask - required for the instruction to work
14588 NewDmask = 1;
14589 }
14590 // Abort if there's no change
14591 if (NewDmask == OldDmask)
14592 return Node;
14593
14594 unsigned BitsSet = llvm::popcount(NewDmask);
14595
14596 // Check for TFE or LWE - increase the number of channels by one to account
14597 // for the extra return value
14598 // This will need adjustment for D16 if this is also included in
14599 // adjustWriteMask (this function), but at present D16 is excluded.
14600 unsigned NewChannels = BitsSet + UsesTFC;
14601
14602 int NewOpcode =
14603 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14604 assert(NewOpcode != -1 &&
14605 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14606 "failed to find equivalent MIMG op");
14607
14608 // Adjust the writemask in the node
14609 SmallVector<SDValue, 12> Ops;
14610 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14611 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14612 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14613
14614 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14615
14616 MVT ResultVT = NewChannels == 1 ?
14617 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14618 NewChannels == 5 ? 8 : NewChannels);
14619 SDVTList NewVTList = HasChain ?
14620 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14621
14622
14623 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14624 NewVTList, Ops);
14625
14626 if (HasChain) {
14627 // Update chain.
14628 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14629 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14630 }
14631
14632 if (NewChannels == 1) {
14633 assert(Node->hasNUsesOfValue(1, 0));
14634 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14635 SDLoc(Node), Users[Lane]->getValueType(0),
14636 SDValue(NewNode, 0));
14637 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14638 return nullptr;
14639 }
14640
14641 // Update the users of the node with the new indices
14642 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14643 SDNode *User = Users[i];
14644 if (!User) {
14645 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14646 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14647 if (i || !NoChannels)
14648 continue;
14649 } else {
14650 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14651 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14652 if (NewUser != User) {
14653 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14654 DAG.RemoveDeadNode(User);
14655 }
14656 }
14657
14658 switch (Idx) {
14659 default: break;
14660 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14661 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14662 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14663 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14664 }
14665 }
14666
14667 DAG.RemoveDeadNode(Node);
14668 return nullptr;
14669}
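// Worked example (illustrative): if an image load was selected with dmask =
// 0xf but only the EXTRACT_SUBREGs for sub0 and sub2 are used, the loop above
// computes NewDmask = 0b0101, the instruction is rewritten to the two-channel
// variant, and the surviving users are retargeted to sub0 and sub1 of the
// narrower result.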
14670
14671static bool isFrameIndexOp(SDValue Op) {
14672 if (Op.getOpcode() == ISD::AssertZext)
14673 Op = Op.getOperand(0);
14674
14675 return isa<FrameIndexSDNode>(Op);
14676}
14677
14678/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14679/// with frame index operands.
14680/// LLVM assumes that inputs to these instructions are registers.
14681SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14682 SelectionDAG &DAG) const {
14683 if (Node->getOpcode() == ISD::CopyToReg) {
14684 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14685 SDValue SrcVal = Node->getOperand(2);
14686
14687 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14688 // to try understanding copies to physical registers.
14689 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
14690 SDLoc SL(Node);
14691 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14692 SDValue VReg = DAG.getRegister(
14693 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14694
14695 SDNode *Glued = Node->getGluedNode();
14696 SDValue ToVReg
14697 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
14698 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
14699 SDValue ToResultReg
14700 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
14701 VReg, ToVReg.getValue(1));
14702 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
14703 DAG.RemoveDeadNode(Node);
14704 return ToResultReg.getNode();
14705 }
14706 }
14707
14708 SmallVector<SDValue, 8> Ops;
14709 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
14710 if (!isFrameIndexOp(Node->getOperand(i))) {
14711 Ops.push_back(Node->getOperand(i));
14712 continue;
14713 }
14714
14715 SDLoc DL(Node);
14716 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
14717 Node->getOperand(i).getValueType(),
14718 Node->getOperand(i)), 0));
14719 }
14720
14721 return DAG.UpdateNodeOperands(Node, Ops);
14722}
14723
14724/// Fold the instructions after selecting them.
14725/// Returns null if users were already updated.
14726SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
14727 SelectionDAG &DAG) const {
14728 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14729 unsigned Opcode = Node->getMachineOpcode();
14730
14731 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
14732 !TII->isGather4(Opcode) &&
14733 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
14734 return adjustWritemask(Node, DAG);
14735 }
14736
14737 if (Opcode == AMDGPU::INSERT_SUBREG ||
14738 Opcode == AMDGPU::REG_SEQUENCE) {
14739 legalizeTargetIndependentNode(Node, DAG);
14740 return Node;
14741 }
14742
14743 switch (Opcode) {
14744 case AMDGPU::V_DIV_SCALE_F32_e64:
14745 case AMDGPU::V_DIV_SCALE_F64_e64: {
14746 // Satisfy the operand register constraint when one of the inputs is
14747 // undefined. Ordinarily each undef value will have its own implicit_def of
14748 // a vreg, so force these to use a single register.
14749 SDValue Src0 = Node->getOperand(1);
14750 SDValue Src1 = Node->getOperand(3);
14751 SDValue Src2 = Node->getOperand(5);
14752
14753 if ((Src0.isMachineOpcode() &&
14754 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
14755 (Src0 == Src1 || Src0 == Src2))
14756 break;
14757
14758 MVT VT = Src0.getValueType().getSimpleVT();
14759 const TargetRegisterClass *RC =
14760 getRegClassFor(VT, Src0.getNode()->isDivergent());
14761
14763 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
14764
14765 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
14766 UndefReg, Src0, SDValue());
14767
14768 // src0 must be the same register as src1 or src2, even if the value is
14769 // undefined, so make sure we don't violate this constraint.
14770 if (Src0.isMachineOpcode() &&
14771 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
14772 if (Src1.isMachineOpcode() &&
14773 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
14774 Src0 = Src1;
14775 else if (Src2.isMachineOpcode() &&
14776 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
14777 Src0 = Src2;
14778 else {
14779 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
14780 Src0 = UndefReg;
14781 Src1 = UndefReg;
14782 }
14783 } else
14784 break;
14785
14786 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
14787 Ops[1] = Src0;
14788 Ops[3] = Src1;
14789 Ops[5] = Src2;
14790 Ops.push_back(ImpDef.getValue(1));
14791 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
14792 }
14793 default:
14794 break;
14795 }
14796
14797 return Node;
14798}
14799
14800// Any MIMG instructions that use tfe or lwe require an initialization of the
14801// result register that will be written in the case of a memory access failure.
14802// The required code is also added to tie this init code to the result of the
14803// img instruction.
14804void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
14805 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14806 const SIRegisterInfo &TRI = TII->getRegisterInfo();
14807 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
14808 MachineBasicBlock &MBB = *MI.getParent();
14809
14810 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
14811 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
14812 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
14813
14814 if (!TFE && !LWE) // intersect_ray
14815 return;
14816
14817 unsigned TFEVal = TFE ? TFE->getImm() : 0;
14818 unsigned LWEVal = LWE ? LWE->getImm() : 0;
14819 unsigned D16Val = D16 ? D16->getImm() : 0;
14820
14821 if (!TFEVal && !LWEVal)
14822 return;
14823
14824 // At least one of TFE or LWE is non-zero
14825 // We have to insert a suitable initialization of the result value and
14826 // tie this to the dest of the image instruction.
14827
14828 const DebugLoc &DL = MI.getDebugLoc();
14829
14830 int DstIdx =
14831 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
14832
14833 // Calculate which dword we have to initialize to 0.
14834 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
14835
14836 // check that dmask operand is found.
14837 assert(MO_Dmask && "Expected dmask operand in instruction");
14838
14839 unsigned dmask = MO_Dmask->getImm();
14840 // Determine the number of active lanes taking into account the
14841 // Gather4 special case
14842 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
14843
14844 bool Packed = !Subtarget->hasUnpackedD16VMem();
14845
14846 unsigned InitIdx =
14847 D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
14848
14849 // Abandon attempt if the dst size isn't large enough
14850 // - this is in fact an error but this is picked up elsewhere and
14851 // reported correctly.
14852 uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
14853 if (DstSize < InitIdx)
14854 return;
14855
14856 // Create a register for the initialization value.
14857 Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
14858 unsigned NewDst = 0; // Final initialized value will be in here
14859
14860 // If PRTStrictNull feature is enabled (the default) then initialize
14861 // all the result registers to 0, otherwise just the error indication
14862 // register (VGPRn+1)
14863 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
14864 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
14865
14866 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
14867 for (; SizeLeft; SizeLeft--, CurrIdx++) {
14868 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
14869 // Initialize dword
14870 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
14871 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
14872 .addImm(0);
14873 // Insert into the super-reg
14874 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
14875 .addReg(PrevDst)
14876 .addReg(SubReg)
14877 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
14878
14879 PrevDst = NewDst;
14880 }
14881
14882 // Add as an implicit operand
14883 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
14884
14885 // Tie the just added implicit operand to the dst
14886 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
14887}
14888
14889/// Assign the register class depending on the number of
14890/// bits set in the writemask
14891void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
14892 SDNode *Node) const {
14893 const SIInstrInfo *TII = Subtarget->getInstrInfo();
14894
14895 MachineFunction *MF = MI.getParent()->getParent();
14896 MachineRegisterInfo &MRI = MF->getRegInfo();
14897 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
14898
14899 if (TII->isVOP3(MI.getOpcode())) {
14900 // Make sure constant bus requirements are respected.
14901 TII->legalizeOperandsVOP3(MRI, MI);
14902
14903 // Prefer VGPRs over AGPRs in mAI instructions where possible.
14904 // This saves a chain-copy of registers and better balances register
14905 // use between vgpr and agpr, as agpr tuples tend to be big.
14906 if (!MI.getDesc().operands().empty()) {
14907 unsigned Opc = MI.getOpcode();
14908 bool HasAGPRs = Info->mayNeedAGPRs();
14909 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
14910 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
14911 for (auto I :
14912 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
14913 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
14914 if (I == -1)
14915 break;
14916 if ((I == Src2Idx) && (HasAGPRs))
14917 break;
14918 MachineOperand &Op = MI.getOperand(I);
14919 if (!Op.isReg() || !Op.getReg().isVirtual())
14920 continue;
14921 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
14922 if (!TRI->hasAGPRs(RC))
14923 continue;
14924 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
14925 if (!Src || !Src->isCopy() ||
14926 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
14927 continue;
14928 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
14929 // All uses of agpr64 and agpr32 can also accept vgpr except for
14930 // v_accvgpr_read, but we do not produce agpr reads during selection,
14931 // so no use checks are needed.
14932 MRI.setRegClass(Op.getReg(), NewRC);
14933 }
14934
14935 if (!HasAGPRs)
14936 return;
14937
14938 // Resolve the rest of AV operands to AGPRs.
14939 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
14940 if (Src2->isReg() && Src2->getReg().isVirtual()) {
14941 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
14942 if (TRI->isVectorSuperClass(RC)) {
14943 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
14944 MRI.setRegClass(Src2->getReg(), NewRC);
14945 if (Src2->isTied())
14946 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
14947 }
14948 }
14949 }
14950 }
14951
14952 return;
14953 }
14954
14955 if (TII->isImage(MI)) {
14956 if (!MI.mayStore())
14957 AddIMGInit(MI);
14958 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
14959 }
14960}
14961
14962static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
14963 uint64_t Val) {
14964 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
14965 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
14966}
14967
14968MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
14969 const SDLoc &DL,
14970 SDValue Ptr) const {
14971 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14972
14973 // Build the half of the subregister with the constants before building the
14974 // full 128-bit register. If we are building multiple resource descriptors,
14975 // this will allow CSEing of the 2-component register.
14976 const SDValue Ops0[] = {
14977 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
14978 buildSMovImm32(DAG, DL, 0),
14979 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
14980 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
14981 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
14982 };
14983
14984 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
14985 MVT::v2i32, Ops0), 0);
14986
14987 // Combine the constants and the pointer.
14988 const SDValue Ops1[] = {
14989 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
14990 Ptr,
14991 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
14992 SubRegHi,
14993 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
14994 };
14995
14996 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
14997}
14998
14999/// Return a resource descriptor with the 'Add TID' bit enabled
15000/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15001/// of the resource descriptor) to create an offset, which is added to
15002/// the resource pointer.
15003MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15004 SDValue Ptr, uint32_t RsrcDword1,
15005 uint64_t RsrcDword2And3) const {
15006 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15007 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15008 if (RsrcDword1) {
15009 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15010 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15011 0);
15012 }
15013
15014 SDValue DataLo = buildSMovImm32(DAG, DL,
15015 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15016 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15017
15018 const SDValue Ops[] = {
15019 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15020 PtrLo,
15021 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15022 PtrHi,
15023 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15024 DataLo,
15025 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15026 DataHi,
15027 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15028 };
15029
15030 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15031}
15032
15033//===----------------------------------------------------------------------===//
15034// SI Inline Assembly Support
15035//===----------------------------------------------------------------------===//
15036
15037std::pair<unsigned, const TargetRegisterClass *>
15038SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15039 StringRef Constraint,
15040 MVT VT) const {
15041 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15042
15043 const TargetRegisterClass *RC = nullptr;
15044 if (Constraint.size() == 1) {
15045 const unsigned BitWidth = VT.getSizeInBits();
15046 switch (Constraint[0]) {
15047 default:
15048 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15049 case 's':
15050 case 'r':
15051 switch (BitWidth) {
15052 case 16:
15053 RC = &AMDGPU::SReg_32RegClass;
15054 break;
15055 case 64:
15056 RC = &AMDGPU::SGPR_64RegClass;
15057 break;
15058 default:
15059 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15060 if (!RC)
15061 return std::pair(0U, nullptr);
15062 break;
15063 }
15064 break;
15065 case 'v':
15066 switch (BitWidth) {
15067 case 16:
15068 RC = &AMDGPU::VGPR_32RegClass;
15069 break;
15070 default:
15071 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15072 if (!RC)
15073 return std::pair(0U, nullptr);
15074 break;
15075 }
15076 break;
15077 case 'a':
15078 if (!Subtarget->hasMAIInsts())
15079 break;
15080 switch (BitWidth) {
15081 case 16:
15082 RC = &AMDGPU::AGPR_32RegClass;
15083 break;
15084 default:
15085 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15086 if (!RC)
15087 return std::pair(0U, nullptr);
15088 break;
15089 }
15090 break;
15091 }
15092 // We actually support i128, i16 and f16 as inline parameters
15093 // even if they are not reported as legal
15094 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15095 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15096 return std::pair(0U, RC);
15097 }
15098
15099 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15100 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15101 if (RegName.consume_front("v")) {
15102 RC = &AMDGPU::VGPR_32RegClass;
15103 } else if (RegName.consume_front("s")) {
15104 RC = &AMDGPU::SGPR_32RegClass;
15105 } else if (RegName.consume_front("a")) {
15106 RC = &AMDGPU::AGPR_32RegClass;
15107 }
15108
15109 if (RC) {
15110 uint32_t Idx;
15111 if (RegName.consume_front("[")) {
15112 uint32_t End;
15113 bool Failed = RegName.consumeInteger(10, Idx);
15114 Failed |= !RegName.consume_front(":");
15115 Failed |= RegName.consumeInteger(10, End);
15116 Failed |= !RegName.consume_back("]");
15117 if (!Failed) {
15118 uint32_t Width = (End - Idx + 1) * 32;
15119 MCRegister Reg = RC->getRegister(Idx);
15120 if (SIRegisterInfo::isVGPRClass(RC))
15121 RC = TRI->getVGPRClassForBitWidth(Width);
15122 else if (SIRegisterInfo::isSGPRClass(RC))
15123 RC = TRI->getSGPRClassForBitWidth(Width);
15124 else if (SIRegisterInfo::isAGPRClass(RC))
15125 RC = TRI->getAGPRClassForBitWidth(Width);
15126 if (RC) {
15127 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15128 return std::pair(Reg, RC);
15129 }
15130 }
15131 } else {
15132 bool Failed = RegName.getAsInteger(10, Idx);
15133 if (!Failed && Idx < RC->getNumRegs())
15134 return std::pair(RC->getRegister(Idx), RC);
15135 }
15136 }
15137 }
15138
15139 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15140 if (Ret.first)
15141 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15142
15143 return Ret;
15144}
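// Example (illustrative, not from this file): how the constraints handled
// above typically appear in source-level inline assembly. 'v' selects a
// VGPR, 's' an SGPR, 'a' an AGPR, and a braced range such as "{v[0:1]}"
// resolves to the matching 64-bit register class starting at v0:
//
//   int out;
//   int in = 42;
//   __asm__ volatile("v_mov_b32 %0, %1" : "=v"(out) : "s"(in));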
15145
15146static bool isImmConstraint(StringRef Constraint) {
15147 if (Constraint.size() == 1) {
15148 switch (Constraint[0]) {
15149 default: break;
15150 case 'I':
15151 case 'J':
15152 case 'A':
15153 case 'B':
15154 case 'C':
15155 return true;
15156 }
15157 } else if (Constraint == "DA" ||
15158 Constraint == "DB") {
15159 return true;
15160 }
15161 return false;
15162}
15163
15164SITargetLowering::ConstraintType
15165SITargetLowering::getConstraintType(StringRef Constraint) const {
15166 if (Constraint.size() == 1) {
15167 switch (Constraint[0]) {
15168 default: break;
15169 case 's':
15170 case 'v':
15171 case 'a':
15172 return C_RegisterClass;
15173 }
15174 }
15175 if (isImmConstraint(Constraint)) {
15176 return C_Other;
15177 }
15178 return TargetLowering::getConstraintType(Constraint);
15179}
15180
15181static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15183 Val = Val & maskTrailingOnes<uint64_t>(Size);
15184 }
15185 return Val;
15186}
15187
15188void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15189 StringRef Constraint,
15190 std::vector<SDValue> &Ops,
15191 SelectionDAG &DAG) const {
15192 if (isImmConstraint(Constraint)) {
15193 uint64_t Val;
15194 if (getAsmOperandConstVal(Op, Val) &&
15195 checkAsmConstraintVal(Op, Constraint, Val)) {
15196 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15197 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15198 }
15199 } else {
15200 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15201 }
15202}
15203
15204bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15205 unsigned Size = Op.getScalarValueSizeInBits();
15206 if (Size > 64)
15207 return false;
15208
15209 if (Size == 16 && !Subtarget->has16BitInsts())
15210 return false;
15211
15212 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15213 Val = C->getSExtValue();
15214 return true;
15215 }
15216 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15217 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15218 return true;
15219 }
15220 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15221 if (Size != 16 || Op.getNumOperands() != 2)
15222 return false;
15223 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15224 return false;
15225 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15226 Val = C->getSExtValue();
15227 return true;
15228 }
15229 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15230 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15231 return true;
15232 }
15233 }
15234
15235 return false;
15236}
15237
15238bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15239 uint64_t Val) const {
15240 if (Constraint.size() == 1) {
15241 switch (Constraint[0]) {
15242 case 'I':
15243 return AMDGPU::isInlinableIntLiteral(Val);
15244 case 'J':
15245 return isInt<16>(Val);
15246 case 'A':
15247 return checkAsmConstraintValA(Op, Val);
15248 case 'B':
15249 return isInt<32>(Val);
15250 case 'C':
15251 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15252 AMDGPU::isInlinableIntLiteral(Val);
15253 default:
15254 break;
15255 }
15256 } else if (Constraint.size() == 2) {
15257 if (Constraint == "DA") {
15258 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15259 int64_t LoBits = static_cast<int32_t>(Val);
15260 return checkAsmConstraintValA(Op, HiBits, 32) &&
15261 checkAsmConstraintValA(Op, LoBits, 32);
15262 }
15263 if (Constraint == "DB") {
15264 return true;
15265 }
15266 }
15267 llvm_unreachable("Invalid asm constraint");
15268}
15269
15270bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
15271 uint64_t Val,
15272 unsigned MaxSize) const {
15273 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15274 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15275 if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
15276 (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15277 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
15278 return true;
15279 }
15280 return false;
15281}
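// For reference (illustrative): the 'A' constraint checked above accepts
// only values that are inlinable literals for the operand size, i.e. the
// integers -16..64, +/-0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0 and, on
// subtargets with hasInv2PiInlineImm(), 1/(2*pi).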
15282
15283static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15284 switch (UnalignedClassID) {
15285 case AMDGPU::VReg_64RegClassID:
15286 return AMDGPU::VReg_64_Align2RegClassID;
15287 case AMDGPU::VReg_96RegClassID:
15288 return AMDGPU::VReg_96_Align2RegClassID;
15289 case AMDGPU::VReg_128RegClassID:
15290 return AMDGPU::VReg_128_Align2RegClassID;
15291 case AMDGPU::VReg_160RegClassID:
15292 return AMDGPU::VReg_160_Align2RegClassID;
15293 case AMDGPU::VReg_192RegClassID:
15294 return AMDGPU::VReg_192_Align2RegClassID;
15295 case AMDGPU::VReg_224RegClassID:
15296 return AMDGPU::VReg_224_Align2RegClassID;
15297 case AMDGPU::VReg_256RegClassID:
15298 return AMDGPU::VReg_256_Align2RegClassID;
15299 case AMDGPU::VReg_288RegClassID:
15300 return AMDGPU::VReg_288_Align2RegClassID;
15301 case AMDGPU::VReg_320RegClassID:
15302 return AMDGPU::VReg_320_Align2RegClassID;
15303 case AMDGPU::VReg_352RegClassID:
15304 return AMDGPU::VReg_352_Align2RegClassID;
15305 case AMDGPU::VReg_384RegClassID:
15306 return AMDGPU::VReg_384_Align2RegClassID;
15307 case AMDGPU::VReg_512RegClassID:
15308 return AMDGPU::VReg_512_Align2RegClassID;
15309 case AMDGPU::VReg_1024RegClassID:
15310 return AMDGPU::VReg_1024_Align2RegClassID;
15311 case AMDGPU::AReg_64RegClassID:
15312 return AMDGPU::AReg_64_Align2RegClassID;
15313 case AMDGPU::AReg_96RegClassID:
15314 return AMDGPU::AReg_96_Align2RegClassID;
15315 case AMDGPU::AReg_128RegClassID:
15316 return AMDGPU::AReg_128_Align2RegClassID;
15317 case AMDGPU::AReg_160RegClassID:
15318 return AMDGPU::AReg_160_Align2RegClassID;
15319 case AMDGPU::AReg_192RegClassID:
15320 return AMDGPU::AReg_192_Align2RegClassID;
15321 case AMDGPU::AReg_256RegClassID:
15322 return AMDGPU::AReg_256_Align2RegClassID;
15323 case AMDGPU::AReg_512RegClassID:
15324 return AMDGPU::AReg_512_Align2RegClassID;
15325 case AMDGPU::AReg_1024RegClassID:
15326 return AMDGPU::AReg_1024_Align2RegClassID;
15327 default:
15328 return -1;
15329 }
15330}
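// Note (illustrative): on subtargets where needsAlignedVGPRs() is true
// (e.g. gfx90a-style targets), finalizeLowering() below uses this mapping
// to move a virtual register from, say, VReg_64 to VReg_64_Align2 so that
// wide VGPR/AGPR tuples start at an even-aligned register.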
15331
15332// Figure out which registers should be reserved for stack access. Only after
15333// the function is legalized do we know all of the non-spill stack objects or if
15334// calls are present.
15335void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15336 MachineRegisterInfo &MRI = MF.getRegInfo();
15337 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15338 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15339 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15340 const SIInstrInfo *TII = ST.getInstrInfo();
15341
15342 if (Info->isEntryFunction()) {
15343 // Callable functions have fixed registers used for stack access.
15344 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15345 }
15346
15347 // TODO: Move this logic to getReservedRegs()
15348 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15349 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15350 Register SReg = ST.isWave32()
15351 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15352 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15353 &AMDGPU::SGPR_64RegClass);
15354 Info->setSGPRForEXECCopy(SReg);
15355
15356 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15357 Info->getStackPtrOffsetReg()));
15358 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15359 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15360
15361 // We need to worry about replacing the default register with itself in case
15362 // of MIR testcases missing the MFI.
15363 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15364 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15365
15366 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15367 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15368
15369 Info->limitOccupancy(MF);
15370
15371 if (ST.isWave32() && !MF.empty()) {
15372 for (auto &MBB : MF) {
15373 for (auto &MI : MBB) {
15374 TII->fixImplicitOperands(MI);
15375 }
15376 }
15377 }
15378
15379 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15380 // classes if required. Ideally the register class constraints would differ
15381 // per-subtarget, but there's no easy way to achieve that right now. This is
15382 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15383 // from using them as the register class for legal types.
15384 if (ST.needsAlignedVGPRs()) {
15385 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15386 const Register Reg = Register::index2VirtReg(I);
15387 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15388 if (!RC)
15389 continue;
15390 int NewClassID = getAlignedAGPRClassID(RC->getID());
15391 if (NewClassID != -1)
15392 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15393 }
15394 }
15395
15396 TargetLoweringBase::finalizeLowering(MF);
15397}
15398
15399void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15400 KnownBits &Known,
15401 const APInt &DemandedElts,
15402 const SelectionDAG &DAG,
15403 unsigned Depth) const {
15404 Known.resetAll();
15405 unsigned Opc = Op.getOpcode();
15406 switch (Opc) {
15407 case ISD::INTRINSIC_WO_CHAIN: {
15408 unsigned IID = Op.getConstantOperandVal(0);
15409 switch (IID) {
15410 case Intrinsic::amdgcn_mbcnt_lo:
15411 case Intrinsic::amdgcn_mbcnt_hi: {
15412 const GCNSubtarget &ST =
15413 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15414 // These return at most (wavefront size - 1) + src1.
15415 // As long as src1 is an immediate we can calculate the known bits.
15416 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15417 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15418 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15419 // Cater for potential carry
15420 MaxActiveBits += Src1ValBits ? 1 : 0;
15421 unsigned Size = Op.getValueType().getSizeInBits();
15422 if (MaxActiveBits < Size)
15423 Known.Zero.setHighBits(Size - MaxActiveBits);
15424 return;
15425 }
15426 }
15427 break;
15428 }
15429 }
15430 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15431 Op, Known, DemandedElts, DAG, Depth);
15432}
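// Worked example (illustrative): for amdgcn_mbcnt_lo on a wave64 subtarget
// where src1 is known to need at most 5 bits, MaxActiveBits =
// max(5, log2(64)) + 1 = 7, so the upper 25 bits of the 32-bit result are
// reported as known zero.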
15433
15434void SITargetLowering::computeKnownBitsForFrameIndex(
15435 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15436 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15437
15438 // Set the high bits to zero based on the maximum allowed scratch size per
15439 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15440 // calculation won't overflow, so assume the sign bit is never set.
15441 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15442}
15443
15444static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15445 KnownBits &Known, unsigned Dim) {
15446 unsigned MaxValue =
15447 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15448 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15449}
15450
15451void SITargetLowering::computeKnownBitsForTargetInstr(
15452 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15453 const MachineRegisterInfo &MRI, unsigned Depth) const {
15454 const MachineInstr *MI = MRI.getVRegDef(R);
15455 switch (MI->getOpcode()) {
15456 case AMDGPU::G_INTRINSIC:
15457 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15458 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15459 case Intrinsic::amdgcn_workitem_id_x:
15460 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15461 break;
15462 case Intrinsic::amdgcn_workitem_id_y:
15463 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15464 break;
15465 case Intrinsic::amdgcn_workitem_id_z:
15466 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15467 break;
15468 case Intrinsic::amdgcn_mbcnt_lo:
15469 case Intrinsic::amdgcn_mbcnt_hi: {
15470 // These return at most the wavefront size - 1.
15471 unsigned Size = MRI.getType(R).getSizeInBits();
15472 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15473 break;
15474 }
15475 case Intrinsic::amdgcn_groupstaticsize: {
15476 // We can report everything over the maximum size as 0. We can't report
15477 // based on the actual size because we don't know if it's accurate or not
15478 // at any given point.
15479 Known.Zero.setHighBits(
15480 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15481 break;
15482 }
15483 }
15484 break;
15485 }
15486 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15487 Known.Zero.setHighBits(24);
15488 break;
15489 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15490 Known.Zero.setHighBits(16);
15491 break;
15492 case AMDGPU::G_AMDGPU_SMED3:
15493 case AMDGPU::G_AMDGPU_UMED3: {
15494 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15495
15496 KnownBits Known2;
15497 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15498 if (Known2.isUnknown())
15499 break;
15500
15501 KnownBits Known1;
15502 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15503 if (Known1.isUnknown())
15504 break;
15505
15506 KnownBits Known0;
15507 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15508 if (Known0.isUnknown())
15509 break;
15510
15511 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15512 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15513 Known.One = Known0.One & Known1.One & Known2.One;
15514 break;
15515 }
15516 }
15517}
15518
15519Align SITargetLowering::computeKnownAlignForTargetInstr(
15520 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15521 unsigned Depth) const {
15522 const MachineInstr *MI = MRI.getVRegDef(R);
15523 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15524 // FIXME: Can this move to generic code? What about the case where the call
15525 // site specifies a lower alignment?
15526 Intrinsic::ID IID = GI->getIntrinsicID();
15527 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15528 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15529 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15530 return *RetAlign;
15531 }
15532 return Align(1);
15533}
15534
15535Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15536 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15537 const Align CacheLineAlign = Align(64);
15538
15539 // Pre-GFX10 targets do not benefit from loop alignment.
15540 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15541 getSubtarget()->hasInstFwdPrefetchBug())
15542 return PrefAlign;
15543
15544 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
15545 // By default the prefetcher keeps one cache line behind and reads two ahead.
15546 // We can modify it with S_INST_PREFETCH so that larger loops have two lines
15547 // behind and one ahead.
15548 // Therefore we can benefit from aligning loop headers if the loop fits in
15549 // 192 bytes. If the loop fits in 64 bytes it never spans more than two cache
15550 // lines and needs no alignment.
15551 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
15552 // prefetch; if it is at most 192 bytes we need two lines behind.
15553
15554 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15555 const MachineBasicBlock *Header = ML->getHeader();
15556 if (Header->getAlignment() != PrefAlign)
15557 return Header->getAlignment(); // Already processed.
15558
15559 unsigned LoopSize = 0;
15560 for (const MachineBasicBlock *MBB : ML->blocks()) {
15561 // If an inner loop block is aligned, assume on average half of the
15562 // alignment size is added as nops.
15563 if (MBB != Header)
15564 LoopSize += MBB->getAlignment().value() / 2;
15565
15566 for (const MachineInstr &MI : *MBB) {
15567 LoopSize += TII->getInstSizeInBytes(MI);
15568 if (LoopSize > 192)
15569 return PrefAlign;
15570 }
15571 }
15572
15573 if (LoopSize <= 64)
15574 return PrefAlign;
15575
15576 if (LoopSize <= 128)
15577 return CacheLineAlign;
15578
15579 // If any parent loop is already surrounded by prefetch instructions, do not
15580 // insert new ones for the inner loop, as that would reset the parent's settings.
15581 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15582 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15583 auto I = Exit->getFirstNonDebugInstr();
15584 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15585 return CacheLineAlign;
15586 }
15587 }
15588
15589 MachineBasicBlock *Pre = ML->getLoopPreheader();
15590 MachineBasicBlock *Exit = ML->getExitBlock();
15591
15592 if (Pre && Exit) {
15593 auto PreTerm = Pre->getFirstTerminator();
15594 if (PreTerm == Pre->begin() ||
15595 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15596 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15597 .addImm(1); // prefetch 2 lines behind PC
15598
15599 auto ExitHead = Exit->getFirstNonDebugInstr();
15600 if (ExitHead == Exit->end() ||
15601 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15602 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15603 .addImm(2); // prefetch 1 line behind PC
15604 }
15605
15606 return CacheLineAlign;
15607}
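// Worked example (illustrative): a 150-byte loop on a GFX10 subtarget is
// over 128 bytes but fits in 192, so the header gets the 64-byte
// CacheLineAlign and S_INST_PREFETCH is switched to two-lines-behind
// (imm 1) in the preheader and back to one-line-behind (imm 2) at the exit.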
15608
15610static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15611 assert(N->getOpcode() == ISD::CopyFromReg);
15612 do {
15613 // Follow the chain until we find an INLINEASM node.
15614 N = N->getOperand(0).getNode();
15615 if (N->getOpcode() == ISD::INLINEASM ||
15616 N->getOpcode() == ISD::INLINEASM_BR)
15617 return true;
15618 } while (N->getOpcode() == ISD::CopyFromReg);
15619 return false;
15620}
15621
15622bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15623 FunctionLoweringInfo *FLI,
15624 UniformityInfo *UA) const {
15625 switch (N->getOpcode()) {
15626 case ISD::CopyFromReg: {
15627 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15628 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15629 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15630 Register Reg = R->getReg();
15631
15632 // FIXME: Why does this need to consider isLiveIn?
15633 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15634 return !TRI->isSGPRReg(MRI, Reg);
15635
15636 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15637 return UA->isDivergent(V);
15638
15639 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15640 return !TRI->isSGPRReg(MRI, Reg);
15641 }
15642 case ISD::LOAD: {
15643 const LoadSDNode *L = cast<LoadSDNode>(N);
15644 unsigned AS = L->getAddressSpace();
15645 // A flat load may access private memory.
15646 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15647 }
15648 case ISD::CALLSEQ_END:
15649 return true;
15650 case ISD::INTRINSIC_WO_CHAIN:
15651 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15652 case ISD::INTRINSIC_W_CHAIN:
15653 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15675 // Target-specific read-modify-write atomics are sources of divergence.
15676 return true;
15677 default:
15678 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
15679 // Generic read-modify-write atomics are sources of divergence.
15680 return A->readMem() && A->writeMem();
15681 }
15682 return false;
15683 }
15684}
15685
15686bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
15687 EVT VT) const {
15688 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
15689 case MVT::f32:
15690 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
15691 case MVT::f64:
15692 case MVT::f16:
15693 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
15694 default:
15695 return false;
15696 }
15697}
15698
15699bool SITargetLowering::denormalsEnabledForType(LLT Ty,
15700 MachineFunction &MF) const {
15701 switch (Ty.getScalarSizeInBits()) {
15702 case 32:
15703 return !denormalModeIsFlushAllF32(MF);
15704 case 64:
15705 case 16:
15706 return !denormalModeIsFlushAllF64F16(MF);
15707 default:
15708 return false;
15709 }
15710}
15711
15712bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
15713 const SelectionDAG &DAG,
15714 bool SNaN,
15715 unsigned Depth) const {
15716 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
15717 const MachineFunction &MF = DAG.getMachineFunction();
15718 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15719
15720 if (Info->getMode().DX10Clamp)
15721 return true; // Clamped to 0.
15722 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
15723 }
15724
15725 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
15726 SNaN, Depth);
15727}
15728
15729// Global FP atomic instructions have a hardcoded FP mode and do not support
15730// FP32 denormals, and only support v2f16 denormals.
15731static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
15732 const fltSemantics &Flt = RMW->getType()->getFltSemantics();
15733 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
15734 if (&Flt == &APFloat::IEEEsingle())
15735 return DenormMode == DenormalMode::getPreserveSign();
15736 return DenormMode == DenormalMode::getIEEE();
15737}
15738
15739// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
15740// floating point atomic instructions. May generate more efficient code,
15741// but may not respect rounding and denormal modes, and may give incorrect
15742// results for certain memory destinations.
15743bool unsafeFPAtomicsDisabled(Function *F) {
15744 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
15745 "true";
15746}
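// Example (illustrative IR, not from this file): the attribute queried above
// is a plain string function attribute:
//
//   define void @f(ptr %p, float %v) "amdgpu-unsafe-fp-atomics"="true" {
//     %old = atomicrmw fadd ptr %p, float %v seq_cst
//     ret void
//   }
//
// Without it (or with any value other than "true"), most FP atomicrmw cases
// below fall back to a cmpxchg expansion.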
15747
15748TargetLowering::AtomicExpansionKind
15749SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
15750 unsigned AS = RMW->getPointerAddressSpace();
15751 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
15752 return AtomicExpansionKind::NotAtomic;
15753
15754 auto SSID = RMW->getSyncScopeID();
15755
15756 auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
15757 OptimizationRemarkEmitter ORE(RMW->getFunction());
15758 LLVMContext &Ctx = RMW->getFunction()->getContext();
15759 SmallVector<StringRef> SSNs;
15760 Ctx.getSyncScopeNames(SSNs);
15761 auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
15762 ? "system"
15763 : SSNs[RMW->getSyncScopeID()];
15764 ORE.emit([&]() {
15765 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
15766 << "Hardware instruction generated for atomic "
15767 << RMW->getOperationName(RMW->getOperation())
15768 << " operation at memory scope " << MemScope
15769 << " due to an unsafe request.";
15770 });
15771 return Kind;
15772 };
15773
15774 bool HasSystemScope =
15775 SSID == SyncScope::System ||
15776 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
15777
15778 switch (RMW->getOperation()) {
15779 case AtomicRMWInst::FAdd: {
15780 Type *Ty = RMW->getType();
15781
15782 if (Ty->isHalfTy())
15783 return AtomicExpansionKind::CmpXChg;
15784
15785 if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
15786 return AtomicExpansionKind::CmpXChg;
15787
15789 Subtarget->hasAtomicFaddNoRtnInsts()) {
15790 if (Subtarget->hasGFX940Insts())
15792
15795
15796 // Always expand system scope fp atomics.
15797 if (HasSystemScope)
15798 return AtomicExpansionKind::CmpXChg;
15799
15800 if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) {
15801 // global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
15802 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
15803 return ReportUnsafeHWInst(AtomicExpansionKind::None);
15804 // global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
15805 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
15806 return ReportUnsafeHWInst(AtomicExpansionKind::None);
15807 }
15808
15809 // flat atomic fadd f32: gfx940, gfx11+.
15810 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
15811 Subtarget->hasFlatAtomicFaddF32Inst())
15812 return ReportUnsafeHWInst(AtomicExpansionKind::None);
15813
15814 // global and flat atomic fadd f64: gfx90a, gfx940.
15815 if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
15816 return ReportUnsafeHWInst(AtomicExpansionKind::None);
15817
15818 // If it is in flat address space, and the type is float, we will try to
15819 // expand it, if the target supports global and lds atomic fadd. The
15820 // reason we need that is, in the expansion, we emit the check of address
15821 // space. If it is in global address space, we emit the global atomic
15822 // fadd; if it is in shared address space, we emit the LDS atomic fadd.
15823 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
15824 Subtarget->hasLDSFPAtomicAdd()) {
15825 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
15826 return AtomicExpansionKind::Expand;
15827 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
15828 return AtomicExpansionKind::Expand;
15829 }
15830
15832 }
15833
15834 // DS FP atomics do respect the denormal mode, but the rounding mode is
15835 // fixed to round-to-nearest-even.
15836 // The only exception is DS_ADD_F64 which never flushes regardless of mode.
15837 if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
15838 if (!Ty->isDoubleTy())
15840
15843
15844 return RMW->getFunction()
15845 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
15846 .getValueAsString() == "true"
15847 ? ReportUnsafeHWInst(AtomicExpansionKind::None)
15848 : AtomicExpansionKind::CmpXChg;
15849 }
15850
15852 }
15853 case AtomicRMWInst::FMin:
15854 case AtomicRMWInst::FMax:
15855 case AtomicRMWInst::Min:
15856 case AtomicRMWInst::Max:
15857 case AtomicRMWInst::UMin:
15858 case AtomicRMWInst::UMax: {
15860 if (RMW->getType()->isFloatTy() &&
15863
15864 // Always expand system scope min/max atomics.
15865 if (HasSystemScope)
15866 return AtomicExpansionKind::CmpXChg;
15867 }
15868 break;
15869 }
15870 default:
15871 break;
15872 }
15873
15874 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
15875}
15876
15877TargetLowering::AtomicExpansionKind
15878SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
15879 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
15880 ? AtomicExpansionKind::NotAtomic
15881 : AtomicExpansionKind::None;
15882}
15883
15884TargetLowering::AtomicExpansionKind
15885SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
15886 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
15887 ? AtomicExpansionKind::NotAtomic
15888 : AtomicExpansionKind::None;
15889}
15890
15891TargetLowering::AtomicExpansionKind
15892SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
15893 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
15894 ? AtomicExpansionKind::NotAtomic
15895 : AtomicExpansionKind::None;
15896}
15897
15898const TargetRegisterClass *
15899SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
15900 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
15901 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15902 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
15903 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
15904 : &AMDGPU::SReg_32RegClass;
15905 if (!TRI->isSGPRClass(RC) && !isDivergent)
15906 return TRI->getEquivalentSGPRClass(RC);
15907 else if (TRI->isSGPRClass(RC) && isDivergent)
15908 return TRI->getEquivalentVGPRClass(RC);
15909
15910 return RC;
15911}
15912
15913// FIXME: This is a workaround for DivergenceAnalysis not understanding always
15914// uniform values (as produced by the mask results of control flow intrinsics)
15915// used outside of divergent blocks. The phi users need to also be treated as
15916// always uniform.
15917//
15918// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
15919static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
15920 unsigned WaveSize) {
15921 // FIXME: We assume we never cast the mask results of a control flow
15922 // intrinsic.
15923 // Early exit if the type won't be consistent as a compile time hack.
15924 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
15925 if (!IT || IT->getBitWidth() != WaveSize)
15926 return false;
15927
15928 if (!isa<Instruction>(V))
15929 return false;
15930 if (!Visited.insert(V).second)
15931 return false;
15932 bool Result = false;
15933 for (const auto *U : V->users()) {
15934 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
15935 if (V == U->getOperand(1)) {
15936 switch (Intrinsic->getIntrinsicID()) {
15937 default:
15938 Result = false;
15939 break;
15940 case Intrinsic::amdgcn_if_break:
15941 case Intrinsic::amdgcn_if:
15942 case Intrinsic::amdgcn_else:
15943 Result = true;
15944 break;
15945 }
15946 }
15947 if (V == U->getOperand(0)) {
15948 switch (Intrinsic->getIntrinsicID()) {
15949 default:
15950 Result = false;
15951 break;
15952 case Intrinsic::amdgcn_end_cf:
15953 case Intrinsic::amdgcn_loop:
15954 Result = true;
15955 break;
15956 }
15957 }
15958 } else {
15959 Result = hasCFUser(U, Visited, WaveSize);
15960 }
15961 if (Result)
15962 break;
15963 }
15964 return Result;
15965}
15966
15967bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
15968 const Value *V) const {
15969 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
15970 if (CI->isInlineAsm()) {
15971 // FIXME: This cannot give a correct answer. This should only trigger in
15972 // the case where inline asm returns mixed SGPR and VGPR results, used
15973 // outside the defining block. We don't have a specific result to
15974 // consider, so this assumes if any value is SGPR, the overall register
15975 // also needs to be SGPR.
15976 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
15977 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
15978 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
15979 for (auto &TC : TargetConstraints) {
15980 if (TC.Type == InlineAsm::isOutput) {
15981 ComputeConstraintToUse(TC, SDValue());
15982 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
15983 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
15984 if (RC && SIRI->isSGPRClass(RC))
15985 return true;
15986 }
15987 }
15988 }
15989 }
15990 SmallPtrSet<const Value *, 16> Visited;
15991 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
15992}
15993
15994bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
15995 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
15996 for (; I != E; ++I) {
15997 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
15998 if (getBasePtrIndex(M) == I.getOperandNo())
15999 return true;
16000 }
16001 }
16002 return false;
16003}
16004
16005bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16006 SDValue N1) const {
16007 if (!N0.hasOneUse())
16008 return false;
16009 // Take care of the opportunity to keep N0 uniform
16010 if (N0->isDivergent() || !N1->isDivergent())
16011 return true;
16012 // Check if we have a good chance to form the memory access pattern with the
16013 // base and offset
16014 return (DAG.isBaseWithConstantOffset(N0) &&
16015 hasMemSDNodeUser(*N0->use_begin()));
16016}
16017
16018bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16019 Register N0, Register N1) const {
16020 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16021}
16022
16025 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16026 if (I.getMetadata("amdgpu.noclobber"))
16027 return MONoClobber;
16028 return MachineMemOperand::MONone;
16029}
16030
16031bool SITargetLowering::checkForPhysRegDependency(
16032 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16033 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16034 if (User->getOpcode() != ISD::CopyToReg)
16035 return false;
16036 if (!Def->isMachineOpcode())
16037 return false;
16038 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16039 if (!MDef)
16040 return false;
16041
16042 unsigned ResNo = User->getOperand(Op).getResNo();
16043 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16044 return false;
16045 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16046 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16047 PhysReg = AMDGPU::SCC;
16048 const TargetRegisterClass *RC =
16049 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16050 Cost = RC->getCopyCost();
16051 return true;
16052 }
16053 return false;
16054}
16055
16056void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16057 assert(Subtarget->hasAtomicFaddInsts() &&
16058 "target should have atomic fadd instructions");
16059 assert(AI->getType()->isFloatTy() &&
16061 "generic atomicrmw expansion only supports FP32 operand in flat "
16062 "address space");
16064 "only fadd is supported for now");
16065
16066 // Given: atomicrmw fadd ptr %addr, float %val ordering
16067 //
16068 // With this expansion we produce the following code:
16069 // [...]
16070 // br label %atomicrmw.check.shared
16071 //
16072 // atomicrmw.check.shared:
16073 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16074 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16075 //
16076 // atomicrmw.shared:
16077 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16078 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16079 // float %val ordering
16080 // br label %atomicrmw.phi
16081 //
16082 // atomicrmw.check.private:
16083 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16084 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16085 //
16086 // atomicrmw.private:
16087 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16088 // %loaded.private = load float, ptr addrspace(5) %cast.private
16089 // %val.new = fadd float %loaded.private, %val
16090 // store float %val.new, ptr addrspace(5) %cast.private
16091 // br label %atomicrmw.phi
16092 //
16093 // atomicrmw.global:
16094 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16095 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16096 // float %val ordering
16097 // br label %atomicrmw.phi
16098 //
16099 // atomicrmw.phi:
16100 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16101 // [ %loaded.private, %atomicrmw.private ],
16102 // [ %loaded.global, %atomicrmw.global ]
16103 // br label %atomicrmw.end
16104 //
16105 // atomicrmw.end:
16106 // [...]
16107
16108 IRBuilder<> Builder(AI);
16109 LLVMContext &Ctx = Builder.getContext();
16110
16111 BasicBlock *BB = Builder.GetInsertBlock();
16112 Function *F = BB->getParent();
16113 BasicBlock *ExitBB =
16114 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16115 BasicBlock *CheckSharedBB =
16116 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16117 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16118 BasicBlock *CheckPrivateBB =
16119 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16120 BasicBlock *PrivateBB =
16121 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16122 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16123 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16124
16125 Value *Val = AI->getValOperand();
16126 Type *ValTy = Val->getType();
16127 Value *Addr = AI->getPointerOperand();
16128
16129 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16130 Value *Val) -> Value * {
16131 AtomicRMWInst *OldVal =
16132 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16133 AI->getOrdering(), AI->getSyncScopeID());
16134 SmallVector<std::pair<unsigned, MDNode *>> MDs;
16135 AI->getAllMetadata(MDs);
16136 for (auto &P : MDs)
16137 OldVal->setMetadata(P.first, P.second);
16138 return OldVal;
16139 };
16140
16141 std::prev(BB->end())->eraseFromParent();
16142 Builder.SetInsertPoint(BB);
16143 Builder.CreateBr(CheckSharedBB);
16144
16145 Builder.SetInsertPoint(CheckSharedBB);
16146 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16147 {Addr}, nullptr, "is.shared");
16148 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16149
16150 Builder.SetInsertPoint(SharedBB);
16151 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16152 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16153 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16154 Builder.CreateBr(PhiBB);
16155
16156 Builder.SetInsertPoint(CheckPrivateBB);
16157 CallInst *IsPrivate = Builder.CreateIntrinsic(
16158 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16159 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16160
16161 Builder.SetInsertPoint(PrivateBB);
16162 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16163 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16164 Value *LoadedPrivate =
16165 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16166 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16167 Builder.CreateStore(NewVal, CastToPrivate);
16168 Builder.CreateBr(PhiBB);
16169
16170 Builder.SetInsertPoint(GlobalBB);
16171 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16172 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16173 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16174 Builder.CreateBr(PhiBB);
16175
16176 Builder.SetInsertPoint(PhiBB);
16177 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16178 Loaded->addIncoming(LoadedShared, SharedBB);
16179 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16180 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16181 Builder.CreateBr(ExitBB);
16182
16183 AI->replaceAllUsesWith(Loaded);
16184 AI->eraseFromParent();
16185}
16186
16187LoadInst *
16188SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16189 IRBuilder<> Builder(AI);
16190 auto Order = AI->getOrdering();
16191
16192 // The optimization removes the store aspect of the atomicrmw. Therefore, the
16193 // cache must be flushed if the atomic ordering has release semantics. This
16194 // does not strictly require a fence; a release fence just happens to do that
16195 // flush. Avoid replacing an atomicrmw that has release semantics.
16196 if (isReleaseOrStronger(Order))
16197 return nullptr;
16198
16199 LoadInst *LI = Builder.CreateAlignedLoad(
16200 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16201 LI->setAtomic(Order, AI->getSyncScopeID());
16202 LI->copyMetadata(*AI);
16203 LI->takeName(AI);
16204 AI->replaceAllUsesWith(LI);
16205 AI->eraseFromParent();
16206 return LI;
16207}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:203
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:478
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1172
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1169
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< std::pair< SDValue, unsigned > > &Srcs, unsigned ChainLength)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< std::pair< SDValue, unsigned > > &Srcs, bool IsSigned, bool IsAny)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< std::pair< SDValue, unsigned > > &Src0s, SmallVectorImpl< std::pair< SDValue, unsigned > > &Src1s, int Step)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:988
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
bool isNegative() const
Definition: APFloat.h:1295
APInt bitcastToAPInt() const
Definition: APFloat.h:1210
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
bool isInfinity() const
Definition: APFloat.h:1292
Class for arbitrary precision integers.
Definition: APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1583
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1209
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1193
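A quick illustration of the APInt helpers listed above (arbitrary 32-bit values, shown only to make the bit semantics concrete):
  APInt HighMask = APInt::getHighBitsSet(32, 16); // 0xFFFF0000
  APInt Block = APInt::getBitsSet(32, 4, 8);      // bits [4, 8) set -> 0x000000F0
  unsigned TZ = HighMask.countr_zero();           // 16
  APInt Top(32, 0);
  Top.setHighBits(8);                             // 0xFF000000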
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:521
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:662
An instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:726
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:842
@ FAdd
*p = old + v
Definition: Instructions.h:763
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:754
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:760
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:774
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:758
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:770
Value * getPointerOperand()
Definition: Instructions.h:885
BinOp getOperation() const
Definition: Instructions.h:820
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:876
Value * getValOperand()
Definition: Instructions.h:889
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:862
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:893
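The AtomicRMWInst accessors above are what the atomic-expansion hooks listed later in this index consume; a hypothetical helper (not part of the source) that classifies floating-point atomics might look like:
  static bool isFPAtomicRMW(const AtomicRMWInst &RMW) {
    switch (RMW.getOperation()) {
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      return true; // floating-point read-modify-write operations
    default:
      return false;
    }
  }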
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:318
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:450
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:206
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:607
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
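Taken together, CCState and CCValAssign drive argument lowering roughly as in the sketch below (Ins, CallConv, IsVarArg, MF, and Ctx are assumed placeholders; this shows the general shape, not the target's exact code):
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, IsVarArg));
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc()) {
      // Argument arrives in VA.getLocReg(); copy it out of the register.
    } else {
      // Argument lives in memory at VA.getLocMemOffset(); emit a load.
    }
  }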
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1481
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1567
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1426
unsigned arg_size() const
Definition: InstrTypes.h:1424
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:780
@ ICMP_NE
not equal
Definition: InstrTypes.h:802
bool isSigned() const
Definition: InstrTypes.h:1030
bool isFPPredicate() const
Definition: InstrTypes.h:887
bool isIntPredicate() const
Definition: InstrTypes.h:888
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:197
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:200
iterator_range< arg_iterator > args()
Definition: Function.h:834
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:692
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:341
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:725
bool hasPrefetch() const
Definition: GCNSubtarget.h:862
bool hasD16Images() const
Definition: GCNSubtarget.h:661
bool hasImageStoreD16Bug() const
Definition: GCNSubtarget.h:996
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:455
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:446
bool hasDot7Insts() const
Definition: GCNSubtarget.h:760
bool hasApertureRegs() const
Definition: GCNSubtarget.h:575
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:603
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:730
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:389
bool hasMAIInsts() const
Definition: GCNSubtarget.h:776
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:641
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:505
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:563
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:244
bool hasDot1Insts() const
Definition: GCNSubtarget.h:736
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:798
Align getStackAlignment() const
Definition: GCNSubtarget.h:875
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:433
bool enableFlatScratch() const
Definition: GCNSubtarget.h:628
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:599
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:439
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:814
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:256
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:706
bool useDS128() const
Definition: GCNSubtarget.h:515
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:435
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:248
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:567
bool hasLDSFPAtomicAdd() const
Definition: GCNSubtarget.h:933
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:405
bool hasIntClamp() const
Definition: GCNSubtarget.h:335
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:963
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:355
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:579
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:607
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:888
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:695
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:314
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:842
bool hasFFBL() const
Definition: GCNSubtarget.h:393
bool hasNSAEncoding() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:537
bool hasMed3_16() const
Definition: GCNSubtarget.h:401
bool hasMovrel() const
Definition: GCNSubtarget.h:909
bool needsKernargPreloadBackwardsCompatibility() const
bool hasBFI() const
Definition: GCNSubtarget.h:381
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:555
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:322
bool hasImageGather4D16Bug() const
Definition: GCNSubtarget.h:998
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:500
bool hasFFBH() const
Definition: GCNSubtarget.h:397
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:794
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:800
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:923
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:525
bool hasDot8Insts() const
Definition: GCNSubtarget.h:764
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:520
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:509
Generation getGeneration() const
Definition: GCNSubtarget.h:295
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:693
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:697
bool hasAddr64() const
Definition: GCNSubtarget.h:359
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:409
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:689
bool hasFractBug() const
Definition: GCNSubtarget.h:373
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:377
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:676
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:510
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1806
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1527
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:930
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2380
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1114
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1789
LLVMContext & getContext() const
Definition: IRBuilder.h:176
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1802
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1853
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1108
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2115
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
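As a small usage sketch of the IRBuilder methods above, in the spirit of the IR-level atomic expansions this lowering performs (Ptr, Val, and InsertPt are assumed placeholders):
  IRBuilder<> Builder(InsertPt);
  Value *Old = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, Align(4),
                                       AtomicOrdering::SequentiallyConsistent);
  Value *Sum = Builder.CreateFAdd(Old, Val, "new.val"); // value after the RMW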
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:328
const BasicBlock * getParent() const
Definition: Instruction.h:139
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:93
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:75
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1627
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:364
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:257
constexpr bool isScalar() const
Definition: LowLevelType.h:139
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:49
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:183
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:211
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:177
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:270
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:250
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
bool isCompare() const
Return true if this instruction is a comparison.
Definition: MCInstrDesc.h:341
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Definition: MCInstrDesc.cpp:32
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1059
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:581
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstNonDebugInstr(bool SkipPseudoOp=true)
Returns an iterator to the first non-debug instruction in the basic block, or end().
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
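These MachineBasicBlock operations are the building blocks of the custom-inserter routines further down (emitGWSMemViolTestLoop, splitKillBlock); a hypothetical skeleton, with BB and MF assumed to exist, looks roughly like:
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MachineBasicBlock *ExitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), LoopBB);
  MF->insert(std::next(LoopBB->getIterator()), ExitBB);
  ExitBB->transferSuccessorsAndUpdatePHIs(BB); // ExitBB inherits BB's successors
  BB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(LoopBB);                // back-edge while the loop repeats
  LoopBB->addSuccessor(ExitBB);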
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
Definition: MachineInstr.h:68
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:553
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1743
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
bool hasAtomicFaddRtnForTy(SDValue &Op) const
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
void AddIMGInit(MachineInstr &MI) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operation is known to never be any NaN; if SNaN is true, returns true if it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
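For context, the expansion that shouldExpandVectorDynExt gates turns a variable-index extract into a chain of compares and selects; a simplified, hypothetical version of that expansion (expandDynExtract and all of its parameters are illustrative names, not the in-tree routine):
  SDValue expandDynExtract(SelectionDAG &DAG, const SDLoc &DL, SDValue Vec,
                           SDValue Idx, EVT EltVT, unsigned NumElts) {
    SDValue Result = DAG.getUNDEF(EltVT);
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
                                DAG.getVectorIdxConstant(I, DL));
      SDValue IsI = DAG.getSetCC(DL, MVT::i1, Idx,
                                 DAG.getConstant(I, DL, Idx.getValueType()),
                                 ISD::SETEQ);
      Result = DAG.getSelect(DL, EltVT, IsI, Elt, Result);
    }
    return Result;
  }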
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:720
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:952
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
Definition: SelectionDAG.h:470
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:826
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:771
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:674
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:797
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:843
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or truncating it.
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
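A small sketch (assuming only a SelectionDAG reference and an arbitrary integer SDValue; the predicate name is hypothetical) of how a combine might consult the known-bits analysis before folding.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Hypothetical predicate: is the low bit of Op provably zero?
static bool lowBitIsZero(SelectionDAG &DAG, SDValue Op) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  KnownBits Known = DAG.computeKnownBits(Op);
  if (Known.isUnknown())
    return false;
  return DAG.MaskedValueIsZero(Op, APInt(BitWidth, 1));
}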
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
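For illustration (generic ADT usage, not specific to this file): SmallPtrSet::insert reports whether the element was newly added, which pairs naturally with SmallVector::push_back to deduplicate while keeping insertion order.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Collect the unique, non-null pointers from In, preserving first-seen order.
static void collectUnique(ArrayRef<int *> In, SmallVectorImpl<int *> &Out) {
  SmallPtrSet<int *, 8> Seen;
  for (int *P : In)
    if (P && Seen.insert(P).second) // .second is true only for new elements
      Out.push_back(P);
}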
An instruction for storing to memory.
Definition: Instructions.h:301
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:849
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:271
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
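A sketch (hypothetical enum and constraint letters, loosely in the style of an inline-asm constraint classifier) combining StringSwitch with StringRef::starts_with and ends_with.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

enum class DemoConstraint { Register, Memory, Unknown };

// Hypothetical classifier: "{...}" names an explicit physical register;
// otherwise dispatch on the constraint letter.
static DemoConstraint classifyConstraint(StringRef C) {
  if (C.starts_with("{") && C.ends_with("}"))
    return DemoConstraint::Register;
  return StringSwitch<DemoConstraint>(C)
      .Case("v", DemoConstraint::Register)
      .Case("s", DemoConstraint::Register)
      .Case("m", DemoConstraint::Memory)
      .Default(DemoConstraint::Unknown);
}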
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider type.
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
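For context, a schematic fragment (it belongs inside a TargetLowering subclass constructor and is not the actual SI configuration) showing how per-type legalize actions are registered.

// Inside a hypothetical MyTargetLowering constructor:
//   Expand  - lower to a generic sequence or libcall
//   Promote - perform the operation in another (wider) type
//   Custom  - the target's LowerOperation hook handles it
setOperationAction(ISD::FSIN, MVT::f64, Expand);            // no native f64 sine
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);        // do v2f32 loads as v2i32
setTruncStoreAction(MVT::i64, MVT::i16, Expand);             // no i64->i16 truncating store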
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a wider type.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the data layout.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
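A hedged sketch (hypothetical helper, not the store-lowering code in this file) of the usual pattern: query allowsMemoryAccessForAlignment and fall back to expandUnalignedStore when the access is unsupported or slow.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical fragment: expand a store the hardware cannot perform directly.
static SDValue lowerStoreSketch(const TargetLowering &TLI, StoreSDNode *Store,
                                SelectionDAG &DAG) {
  unsigned IsFast = 0;
  if (!TLI.allowsMemoryAccessForAlignment(
          *DAG.getContext(), DAG.getDataLayout(), Store->getMemoryVT(),
          Store->getAddressSpace(), Store->getAlign(),
          Store->getMemOperand()->getFlags(), &IsFast) ||
      !IsFast)
    return TLI.expandUnalignedStore(Store, DAG);
  return SDValue(); // aligned and fast: leave it to the default path
}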
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, we know that only the DemandedBits bits of the result of Op are demanded.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
TargetOptions Options
bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:369
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:332
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
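For illustration (hypothetical mapping, not from this file): the IR Type predicates above compose into simple classification helpers.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Hypothetical helper: rough scalar bit width of an IR type, 0 if unknown here.
static unsigned roughScalarBits(Type *Ty) {
  Ty = Ty->getScalarType();        // look through vector types
  if (Ty->isHalfTy())   return 16;
  if (Ty->isFloatTy())  return 32;
  if (Ty->isDoubleTy()) return 64;
  if (Ty->isIntegerTy())
    return Ty->getIntegerBitWidth();
  return 0;
}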
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
constexpr bool isZero() const
Definition: TypeSize.h:156
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:81
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width)
void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:402
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:409
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:194
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:197
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:265
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:229
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:246
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:242
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:191
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1120
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1261
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1263
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1233
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1264
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:977
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1029
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:913
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:480
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1259
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1260
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:937
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1383
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1266
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1045
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1213
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:986
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:928
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1075
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1262
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:507
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1229
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:208
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1014
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1257
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values,...
Definition: ISDOpcodes.h:976
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1203
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1240
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1265
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:969
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1039
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1095
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:922
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1271
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1255
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:982
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1256
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1180
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1200
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1254
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:942
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:414
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:907
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1092
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1068
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1270
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1506
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1486
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1010
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1508
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:270
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:219
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2047
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:258
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point condition code.
Definition: Analysis.cpp:236
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:212
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:428
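Two tiny, self-contained uses (hypothetical helpers) of the math and alignment utilities listed above.

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Round SizeInBytes up to a 16-byte boundary and report it in dwords.
static uint64_t paddedSizeInDwords(uint64_t SizeInBytes) {
  uint64_t Padded = llvm::alignTo(SizeInBytes, llvm::Align(16));
  return llvm::divideCeil(Padded, 4);
}

// Shift amount corresponding to a power-of-two element size.
static unsigned elementShift(uint32_t EltBytes) {
  assert(llvm::isPowerOf2_32(EltBytes) && "expected a power of two");
  return llvm::Log2_32(EltBytes);
}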
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:373
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:113
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:283
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:139
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:351
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:226
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:363
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:448
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:624
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:299
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:160
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:306
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:239
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:311
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:319
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:144
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:66
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:287
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:239
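A self-contained sketch of the KnownBits queries listed above (hypothetical values).

#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

// A 32-bit value whose top 24 bits are known zero needs at most 8 active bits.
static void knownBitsDemo() {
  KnownBits Known(32);
  Known.Zero.setHighBits(24);            // bits [8, 31] are known to be 0
  assert(!Known.isUnknown());
  assert(Known.countMinLeadingZeros() == 24);
  assert(Known.countMaxActiveBits() == 8);
  Known.resetAll();                      // forget everything again
  assert(Known.isUnknown());
}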
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
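For illustration (hypothetical spill helper, not this file's argument-lowering code): storing a value to a frame index with a fixed-stack MachinePointerInfo so later passes know which slot is written. The 4-byte alignment is an assumption.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical helper: store Val into frame index FI.
static SDValue storeToFrameIndex(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue Chain, SDValue Val, int FI) {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  return DAG.getStore(Chain, DL, Val, FIN,
                      MachinePointerInfo::getFixedStack(MF, FI),
                      Align(4)); // assumed slot alignment
}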
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals